diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ae78af1..7e2f8f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,10 +4,31 @@ on: push: branches: - main - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + - encodec-submodule-fix-ci + paths: + [ + ".github/workflows/**", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + ] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', ".github/workflows/**"] + paths: + [ + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + ".github/workflows/**", + ] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -24,7 +45,7 @@ jobs: id: checkout uses: actions/checkout@v4 with: - submodules: true + submodules: recursive - name: Dependencies id: depends @@ -35,6 +56,7 @@ jobs: - name: Build id: cmake_build run: | + cd bark mkdir build cd build cmake .. @@ -48,7 +70,7 @@ jobs: id: checkout uses: actions/checkout@v4 with: - submodules: true + submodules: recursive - name: Dependencies id: depends @@ -60,6 +82,7 @@ jobs: id: cmake_build run: | sysctl -a + cd bark mkdir build cd build cmake .. diff --git a/.gitmodules b/.gitmodules index f76ad7d..6629b72 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "ggml"] - path = ggml - url = https://github.com/ggerganov/ggml.git +[submodule "encodec.cpp"] + path = encodec.cpp + url = https://github.com/PABannier/encodec.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json index 95e18ea..0d930c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -71,6 +71,13 @@ "algorithm": "cpp", "bit": "cpp", "cinttypes": "cpp", - "codecvt": "cpp" - } + "codecvt": "cpp", + "any": "cpp", + "forward_list": "cpp", + "ranges": "cpp", + "set": "cpp", + "span": "cpp", + "valarray": "cpp" + }, + "cmake.sourceDirectory": "/Users/pbannier/Documents/bark.cpp/bark" } \ No newline at end of file diff --git a/README.md b/README.md index 2b9d088..0a481ce 100644 --- a/README.md +++ b/README.md @@ -9,155 +9,83 @@ Inference of [SunoAI's bark model](https://github.com/suno-ai/bark) in pure C/C++. -**Disclaimer: there remains bug in the inference code, bark is able to generate audio for some prompts or some seeds, -but it does not work for most prompts. The current effort of the community is to fix those bugs, in order to release -v0.0.2**. - ## Description -The main goal of `bark.cpp` is to synthesize audio from a textual input with the [Bark](https://github.com/suno-ai/bark) model in efficiently using only CPU. +With `bark.cpp`, my goal is to bring **real-time realistic multilingual** text-to-speech generation to the community. Currently, I am focused on porting the [Bark](https://github.com/suno-ai/bark) model in C++. 
-- [X] Plain C/C++ implementation without dependencies -- [X] AVX, AVX2 and AVX512 for x86 architectures -- [X] Mixed F16 / F32 precision -- [X] 4-bit, 5-bit and 8-bit integer quantization -- [ ] Optimized via ARM NEON, Accelerate and Metal frameworks -- [ ] iOS on-device deployment using CoreML +- [x] Plain C/C++ implementation without dependencies +- [x] AVX, AVX2 and AVX512 for x86 architectures +- [x] CPU and GPU compatible backends +- [x] Mixed F16 / F32 precision +- [x] 4-bit, 5-bit and 8-bit integer quantization +- [x] Metal and CUDA backends The original implementation of `bark.cpp` is the bark's 24Khz English model. We expect to support multiple encoders in the future (see [this](https://github.com/PABannier/bark.cpp/issues/36) and [this](https://github.com/PABannier/bark.cpp/issues/6)), as well as music generation model (see [this](https://github.com/PABannier/bark.cpp/issues/62)). This project is for educational purposes. Demo on [Google Colab](https://colab.research.google.com/drive/1JVtJ6CDwxtKfFmEd8J4FGY2lzdL0d0jT?usp=sharing) ([#95](https://github.com/PABannier/bark.cpp/issues/95)) -**Supported platforms:** +--- -- [X] Mac OS -- [X] Linux -- [X] Windows +Here is a typical run using `bark.cpp`: -**Supported models:** +```java +make -j && ./main -p "This is an audio generated by bark.cpp" -- [X] Bark -- [ ] Vocos -- [ ] AudioCraft + __ __ + / /_ ____ ______/ /__ _________ ____ + / __ \/ __ `/ ___/ //_/ / ___/ __ \/ __ \ + / /_/ / /_/ / / / ,< _ / /__/ /_/ / /_/ / +/_.___/\__,_/_/ /_/|_| (_) \___/ .___/ .___/ + /_/ /_/ ---- -Here are typical audio pieces generated by `bark.cpp`: +bark_tokenize_input: prompt: 'this is a dog barking.' +bark_tokenize_input: number of tokens in prompt = 513, first 8 tokens: 20579 20172 10217 27883 28169 25677 10167 129595 -https://github.com/PABannier/bark.cpp/assets/12958149/f9f240fd-975f-4d69-9bb3-b295a61daaff +Generating semantic tokens: [========> ] (17%) -https://github.com/PABannier/bark.cpp/assets/12958149/c0caadfd-bed9-4a48-8c17-3215963facc1 +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 9.90 ms / 138 tokens +bark_print_statistics: predict time = 3163.78 ms / 22.92 ms per token +bark_print_statistics: total time = 3188.37 ms -Here is a typical run using Bark: +Generating coarse tokens: [==================================================>] (100%) -```java -make -j && ./main -p "this is an audio" -I bark.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. 
-O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.0 (clang-1400.0.29.202) -I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202) - -bark_model_load: loading model from './ggml_weights' -bark_model_load: reading bark text model -gpt_model_load: n_in_vocab = 129600 -gpt_model_load: n_out_vocab = 10048 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 1 -gpt_model_load: n_wtes = 1 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1894.87 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1701.69 MB -bark_model_load: reading bark vocab - -bark_model_load: reading bark coarse model -gpt_model_load: n_in_vocab = 12096 -gpt_model_load: n_out_vocab = 12096 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 1 -gpt_model_load: n_wtes = 1 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1443.87 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1250.69 MB - -bark_model_load: reading bark fine model -gpt_model_load: n_in_vocab = 1056 -gpt_model_load: n_out_vocab = 1056 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 7 -gpt_model_load: n_wtes = 8 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1411.25 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1218.26 MB - -bark_model_load: reading bark codec model -encodec_model_load: model size = 44.32 MB - -bark_model_load: total model size = 74.64 MB - -bark_generate_audio: prompt: 'this is an audio' -bark_generate_audio: number of tokens in prompt = 513, first 8 tokens: 20579 20172 20199 33733 129595 129595 129595 129595 -bark_forward_text_encoder: ........................................................................................................... - -bark_forward_text_encoder: mem per token = 4.80 MB -bark_forward_text_encoder: sample time = 7.91 ms -bark_forward_text_encoder: predict time = 2779.49 ms / 7.62 ms per token -bark_forward_text_encoder: total time = 2829.35 ms - -bark_forward_coarse_encoder: ................................................................................................................................................................. -.................................................................................................................................................................. - -bark_forward_coarse_encoder: mem per token = 8.51 MB -bark_forward_coarse_encoder: sample time = 3.08 ms -bark_forward_coarse_encoder: predict time = 10997.70 ms / 33.94 ms per token -bark_forward_coarse_encoder: total time = 11036.88 ms - -bark_forward_fine_encoder: ..... 
- -bark_forward_fine_encoder: mem per token = 5.11 MB -bark_forward_fine_encoder: sample time = 39.85 ms -bark_forward_fine_encoder: predict time = 19773.94 ms -bark_forward_fine_encoder: total time = 19873.72 ms - - - -bark_forward_encodec: mem per token = 760209 bytes -bark_forward_encodec: predict time = 528.46 ms / 528.46 ms per token -bark_forward_encodec: total time = 663.63 ms +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 3.96 ms / 410 tokens +bark_print_statistics: predict time = 14303.32 ms / 34.89 ms per token +bark_print_statistics: total time = 14315.52 ms -Number of frames written = 51840. +Generating fine tokens: [==================================================>] (100%) +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 41.93 ms / 6144 tokens +bark_print_statistics: predict time = 15234.38 ms / 2.48 ms per token +bark_print_statistics: total time = 15282.15 ms + +Number of frames written = 51840. main: load time = 1436.36 ms main: eval time = 34520.53 ms -main: total time = 35956.92 ms +main: total time = 32786.04 ms ``` +Here are typical audio pieces generated by `bark.cpp`: + +https://github.com/PABannier/bark.cpp/assets/12958149/f9f240fd-975f-4d69-9bb3-b295a61daaff + +https://github.com/PABannier/bark.cpp/assets/12958149/c0caadfd-bed9-4a48-8c17-3215963facc1 + ## Usage -Here are the steps for the bark model. +Here are the steps to use Bark.cpp ### Get the code ```bash git clone --recursive https://github.com/PABannier/bark.cpp.git cd bark.cpp +git submodule update --init --recursive ``` ### Build @@ -165,8 +93,8 @@ cd bark.cpp In order to build bark.cpp you must use `CMake`: ```bash -mkdir build -cd build +mkdir bark/build +cd bark/build cmake .. cmake --build . --config Release ``` @@ -175,43 +103,43 @@ cmake --build . --config Release ```bash # install Python dependencies -python3 -m pip install -r requirements.txt +python3 -m pip install -r bark/requirements.txt # obtain the original bark and encodec weights and place them in ./models -python3 download_weights.py --download-dir ./models +python3 bark/download_weights.py --download-dir ./models # convert the model to ggml format -python3 convert.py \ +python3 bark/convert.py \ --dir-model ./models \ - --codec-path ./models \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ # run the inference -./main -m ./ggml_weights/ -p "this is an audio" +./bark/build/examples/main/main -m ./ggml_weights/ -p "this is an audio" ``` ### (Optional) Quantize weights Weights can be quantized using the following strategy: `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`. -Note that to preserve audio quality, we do not quantize the codec model. The bulk of the -computation is in the forward pass of the GPT models. +Note that to preserve audio quality, we do not quantize the codec model. The bulk of the computation is in the forward pass of the GPT models. 
```bash -./quantize ./ggml_weights/ggml_weights_text.bin ./ggml_weights_q4/ggml_weights_text.bin q4_0 -./quantize ./ggml_weights/ggml_weights_coarse.bin ./ggml_weights_q4/ggml_weights_coarse.bin q4_0 -./quantize ./ggml_weights/ggml_weights_fine.bin ./ggml_weights_q4/ggml_weights_fine.bin q4_0 +mkdir ggml_weights_q4 +cp ggml_weights/*vocab* ggml_weights_q4 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_text.bin ./ggml_weights_q4/ggml_weights_text.bin q4_0 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_coarse.bin ./ggml_weights_q4/ggml_weights_coarse.bin q4_0 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_fine.bin ./ggml_weights_q4/ggml_weights_fine.bin q4_0 ``` -### Seminal papers and background on models +### Seminal papers - Bark - - [Text Prompted Generative Audio](https://github.com/suno-ai/bark) + - [Text Prompted Generative Audio](https://github.com/suno-ai/bark) - Encodec - - [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) + - [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) - GPT-3 - - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) + - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) ### Contributing @@ -225,5 +153,3 @@ computation is in the forward pass of the GPT models. - Avoid adding third-party dependencies, extra files, extra headers, etc. - Always consider cross-compatibility with other operating systems and architectures -- Avoid fancy looking modern STL constructs, keep it simple -- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & ref` diff --git a/bark-util.h b/bark-util.h deleted file mode 100644 index 50fd1e3..0000000 --- a/bark-util.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include - -#define BARK_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "BARK_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -static const size_t MB = 4*1024*1024; - -template -static void read_safe(std::ifstream& fin, T& dest) { - fin.read((char*)& dest, sizeof(T)); -} - -template -static void write_safe(std::ofstream& fout, T& dest) { - fout.write((char*)& dest, sizeof(T)); -} - - -static size_t utf8_len(char src) { - const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} diff --git a/bark.cpp b/bark.cpp deleted file mode 100644 index 62173f8..0000000 --- a/bark.cpp +++ /dev/null @@ -1,2200 +0,0 @@ -/* -Port of Suno's Bark to C/C++. 
- -Author: Pierre-Antoine Bannier -*/ -#include "bark.h" -#include "ggml.h" -#include "bark-util.h" - -// third-party utilities -#define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define BARK_DEBUG 0 -#define EPS_NORM 1e-8 - -typedef std::vector bark_sequence; -typedef std::vector audio_arr_t; - -typedef std::vector> bark_codes; - -struct gpt_hparams { - int32_t n_in_vocab; - int32_t n_out_vocab; - int32_t n_layer; - int32_t n_head; - int32_t n_embd; - int32_t block_size; - int32_t n_lm_heads; - int32_t n_wtes; - int32_t ftype; - - int32_t n_codes_given = 1; -}; - -struct bark_vocab { - using id = int32_t; - using token = std::string; - - std::map token_to_id; - std::map id_to_token; -}; - -struct gpt_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_model { - gpt_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wpe; - - std::vector wtes; - std::vector lm_heads; - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; - - // - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - int64_t t_main_us = 0; - - // - int64_t n_sample = 0; - int64_t n_predict = 0; - - // - int64_t memsize = 0; - size_t mem_per_token = 0; -}; - -struct bark_model { - // encoder - gpt_model coarse_model; - gpt_model fine_model; - gpt_model text_model; - - // decoder - encodec_model codec_model; - - // vocab - bark_vocab vocab; - - int64_t memsize = 0; -}; - -struct bark_context { - bark_context(bark_model & model) : model(model) {} - ~bark_context() { - if (model_owner) { - delete &model; - } - } - - std::mt19937 rng; - - bark_model & model; - - bool model_owner = false; - - int64_t t_load_us; - int64_t t_start_us; - - bark_sequence tokens; - bark_sequence semantic_tokens; - - bark_codes coarse_tokens; - bark_codes fine_tokens; - - audio_arr_t audio_arr; - - float temp; - float fine_temp; - - float min_eos_p; - int sliding_window_size; - int max_coarse_history; -}; - -struct bark_progress { - float current = 0.0f; - const char * func; - - bark_progress(const char * func): func(func) {} - - void callback(float progress) { - float percentage = progress * 100; - if (percentage == 0.0f) { - fprintf(stderr, "%s: ", func); - } - while (percentage > current) { - current = percentage; - fprintf(stderr, "."); - fflush(stderr); - if (percentage >= 100) { - fprintf(stderr, "\n"); - } - } - } -}; - -struct bark_context * bark_new_context_with_model( - struct bark_model * model, - struct bark_context_params params) { - - if (!model) { - return nullptr; - } - - bark_context * ctx = new bark_context(*model); - - ctx->rng = std::mt19937(params.seed); - - ctx->temp = params.temp; - ctx->fine_temp = params.fine_temp; - - ctx->max_coarse_history = params.max_coarse_history; - ctx->sliding_window_size = params.sliding_window_size; - ctx->min_eos_p = params.min_eos_p; - - return ctx; -} - -struct 
bark_context_params bark_context_default_params() { - struct bark_context_params result = { - /*.seed =*/ 0, - /*.temp =*/ 0.7, - /*.fine_temp =*/ 0.5, - /*.min_eos_p =*/ 0.2, - /*.sliding_window_size =*/ 60, - /*.max_coarse_history =*/ 630, - }; - - return result; -} - -void bark_seed_rng(struct bark_context * ctx, int32_t seed) { - if (ctx) { - ctx->rng.seed(seed); - } -} - -int bark_vocab_load( - const char * fname, - bark_vocab * vocab, - int32_t expected_size) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: faield to open '%s'\n", __func__, fname); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname); - return 1; - } - } - - int32_t n_vocab; - read_safe(fin, n_vocab); - - // 5 special tokens: [UNK, SEP, MASK, PAD, CLS] - if (n_vocab != expected_size) { - fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, expected_size); - return 1; - } - - std::string word; - std::vector tmp; - - tmp.reserve(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - read_safe(fin, len); - - if (len > 0) { - tmp.resize(len); - fin.read(&tmp[0], tmp.size()); // read to buffer - word.assign(&tmp[0], tmp.size()); - } else { - word = ""; - } - - vocab->token_to_id[word] = i; - vocab->id_to_token[i] = word; - } - - return 0; -} - -int gpt_model_load(const std::string& fname, gpt_model& model) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return 1; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - read_safe(fin, hparams.n_layer); - read_safe(fin, hparams.n_head); - read_safe(fin, hparams.n_embd); - read_safe(fin, hparams.block_size); - read_safe(fin, hparams.n_in_vocab); - read_safe(fin, hparams.n_out_vocab); - read_safe(fin, hparams.n_lm_heads); - read_safe(fin, hparams.n_wtes); - read_safe(fin, hparams.ftype); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); - printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); - printf("%s: block_size = %d\n", __func__, hparams.block_size); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); - printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return 1; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = 
hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - const int n_in_vocab = hparams.n_in_vocab; - const int n_out_vocab = hparams.n_out_vocab; - const int n_lm_heads = hparams.n_lm_heads; - const int n_wtes = hparams.n_wtes; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_wtes*n_in_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += block_size*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_lm_heads*n_out_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - - ctx_size += block_size*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += block_size*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*512; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - const int n_in_vocab = hparams.n_in_vocab; - const int n_out_vocab = hparams.n_out_vocab; - const int n_lm_heads = hparams.n_lm_heads; - const int n_wtes = hparams.n_wtes; - - model.layers.resize(n_layer); - model.lm_heads.resize(n_lm_heads); - model.wtes.resize(n_wtes); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, block_size); - - for (int i = 0; i < n_wtes; i++) { - model.wtes[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_in_vocab); - model.tensors["model/wte/" + std::to_string(i)] = model.wtes[i]; - } - - for (int i = 0; i < n_lm_heads; i++) { - model.lm_heads[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_out_vocab); - model.tensors["model/lm_head/" + std::to_string(i)] = model.lm_heads[i]; - } - - model.tensors["model/ln_f/g"] = model.ln_f_g; - model.tensors["model/ln_f/b"] = model.ln_f_b; - - model.tensors["model/wpe"] = model.wpe; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; - model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; - - model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; - model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - - const int n_mem = n_layer*block_size; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - // load weights - { - size_t total_size = 0; - - while(true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ttype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return 1; - } - - auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return 1; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return 1; - } - - const 
size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - return 1; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - // printf("%48s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], "float", ggml_nbytes(tensor)/1024.0/1024.0); - - total_size += ggml_nbytes(tensor); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - model.memsize = total_size; - } - - fin.close(); - - return 0; -} - -struct bark_model * bark_load_model_from_file(const char * dirname) { - printf("%s: loading model from '%s'\n", __func__, dirname); - - bark_model * model = new bark_model; - - // text - { - printf("%s: reading bark text model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_text.bin"; - if (gpt_model_load(fname, model->text_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->text_model.memsize; - } - - // vocab - { - printf("%s: reading bark vocab\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_vocab.bin"; - const gpt_hparams hparams = model->text_model.hparams; - const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; - if (bark_vocab_load(fname.c_str(), &model->vocab, expected_size) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); - return nullptr; - } - } - - // coarse - { - printf("\n%s: reading bark coarse model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_coarse.bin"; - if (gpt_model_load(fname, model->coarse_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->coarse_model.memsize; - } - - // fine - { - printf("\n%s: reading bark fine model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_fine.bin"; - if (gpt_model_load(fname, model->fine_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->fine_model.memsize; - } - - // codec - { - printf("\n%s: reading bark codec model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_codec.bin"; - if (encodec_model_load(fname, model->codec_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad codec)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->codec_model.memsize; - } - - printf("\n%s: total model size = %8.2f MB\n", __func__, model->memsize/1024.0/1024.0); - - return model; -} - -int ggml_common_quantize_0( - std::ifstream & fin, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip) { - - ggml_type qtype = GGML_TYPE_F32; - - switch (ftype) { - case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: - case GGML_FTYPE_ALL_F32: - case GGML_FTYPE_MOSTLY_F16: - case 
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: - { - fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); - return 1; - } - }; - - if (!ggml_is_quantized(qtype)) { - fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); - return 1; - } - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ttype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[4] = { 1, 1, 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); - - bool quantize = false; - - // check if we should quantize this tensor - for (const auto & s : to_quant) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // check if we should skip this tensor - for (const auto & s : to_skip) { - if (std::regex_match(name, std::regex(s))) { - quantize = false; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { - fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return 1; - } - - if (ttype == GGML_TYPE_F16) { - data_f16.resize(nelements); - fin.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - fin.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ttype = qtype; - } else { - const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - fin.read(reinterpret_cast(data_u8.data()), nelements * bpe); - } - - write_safe(fout, n_dims); - write_safe(fout, length); - write_safe(fout, ttype); - - for (int i = 0; i < n_dims; ++i) { - write_safe(fout, ne[i]); - } - fout.write(&name[0], length); - - if (quantize) { - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch ((ggml_type) ttype) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q8_0: - { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q8_K: - case GGML_TYPE_COUNT: - { - fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return 1; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / (float)nelements); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); - } - - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); - } - printf("\n"); - } - - return 0; -} - -int bark_model_quantize( - const char * fname_inp, - const char * fname_out, - ggml_ftype ftype) { - printf("%s: loading model from '%s'\n", __func__, fname_inp); - - gpt_model model; - - auto fin = std::ifstream(fname_inp, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp); - return 1; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp); - return 1; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - 
gpt_hparams hparams; - - // load hparams - { - auto & hparams = model.hparams; - - read_safe(fin, hparams.n_layer); - read_safe(fin, hparams.n_head); - read_safe(fin, hparams.n_embd); - read_safe(fin, hparams.block_size); - read_safe(fin, hparams.n_in_vocab); - read_safe(fin, hparams.n_out_vocab); - read_safe(fin, hparams.n_lm_heads); - read_safe(fin, hparams.n_wtes); - read_safe(fin, hparams.ftype); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); - printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); - printf("%s: block_size = %d\n", __func__, hparams.block_size); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); - printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - write_safe(fout, hparams.n_layer); - write_safe(fout, hparams.n_head); - write_safe(fout, hparams.n_embd); - write_safe(fout, hparams.block_size); - write_safe(fout, hparams.n_in_vocab); - write_safe(fout, hparams.n_out_vocab); - write_safe(fout, hparams.n_lm_heads); - write_safe(fout, hparams.n_wtes); - write_safe(fout, ftype_dst); - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - "model/wte/.*", - "model/lm_head/.*", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - }; - - if (ggml_common_quantize_0(fin, fout, ftype, to_quant, {}) > 0) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp); - return 1; - } - - fin.close(); - fout.close(); - - return 0; -} - -std::string strip_accents(const std::string &in_str) { - std::string out_str; - std::map accent_map = {{"À", 'A'},{"Á", 'A'}, - {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, - {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, - {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, - {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, - {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, - {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, - {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, - {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, - {"Ñ", 'N'},{"ñ", 'n'}, - }; - - for (size_t i = 0; i < in_str.length();) { - int len = utf8_len(in_str[i]); - std::string cur = in_str.substr(i, len); - auto iter = accent_map.find(cur); - if (iter != accent_map.end()) - out_str += iter->second; - else - out_str += cur; - - i += len; - } - - return out_str; -} - -void bert_tokenize( - const bark_vocab * vocab, - const char * text, - int32_t * tokens, - int32_t * n_tokens, - int32_t n_max_tokens) { - std::string str = text; - std::vector words; - - int32_t t = 0; - - auto * token_map = &vocab->token_to_id; - - // split the text into words - { - str = strip_accents(text); - - std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; - - std::regex re(pat); - std::smatch m; - - while (std::regex_search(str, m, re)) { - for 
(std::string x : m) - words.push_back(x); - str = m.suffix(); - } - } - - // apply wordpiece - for (const auto &word : words) { - if (word.size() == 0) - continue; - - std::string prefix = ""; - int i = 0; - int n = word.size(); - - loop: - while (i < n) { - if (t >= n_max_tokens - 1) - break; - int j = n; - while (j > i) { - auto it = token_map->find(prefix + word.substr(i, j - i)); - if (it != token_map->end()) { - tokens[t++] = it->second; - i = j; - prefix = "##"; - goto loop; - } - --j; - } - if (j == i) { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); - prefix = "##"; - ++i; - } - } - } - - *n_tokens = t; -} - -static struct ggml_cgraph * bark_build_fine_gpt_graph( - ggml_context * ctx0, - gpt_model * model, - bark_token * tokens, - int n_tokens, - int codebook_ix) { - // tokens: [n_channels, N] - const int N = n_tokens/N_FINE_CODEBOOKS; - const int n_channels = N_FINE_CODEBOOKS; - - const auto & hparams = model->hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.block_size; - const int n_head = hparams.n_head; - - const int n_codes_given = hparams.n_codes_given; - - BARK_ASSERT(N <= n_ctx); - BARK_ASSERT(codebook_ix > 0); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inpL; - struct ggml_tensor * cur; - - struct ggml_tensor * input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_channels); - memcpy(input->data, tokens, N*n_channels*ggml_element_size(input)); - ggml_set_name(input, "input_tokens"); - - struct ggml_tensor * tok_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); - ggml_set_name(tok_emb, "token_embeddings"); - ggml_set_zero(tok_emb); - - for (int wte_ix = 0; wte_ix < codebook_ix + 1; wte_ix++) { - struct ggml_tensor * cur = ggml_get_rows(ctx0, - model->wtes[wte_ix], - ggml_view_1d(ctx0, input, N, wte_ix*input->nb[1])); - tok_emb = ggml_add(ctx0, tok_emb, cur); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = i; - } - struct ggml_tensor * pos_emb = ggml_get_rows(ctx0, model->wpe, position); - ggml_set_name(pos_emb, "position_embeddings"); - - // wte + wpe - inpL = ggml_add(ctx0, tok_emb, pos_emb); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - - for (int il = 0; il < n_layer; il++) { - ggml_format_name(inpL, "layer_inp_%d", il); - - // norm - { - cur = ggml_norm(ctx0, inpL, EPS_NORM); - ggml_set_name(cur, "norm_0"); - - // cur = ln_1_g*cur + ln_1_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_1_b, cur)); - ggml_set_name(cur, "layer_norm_0"); - } - - // self-attention - { - // cur = attn_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_attn_w, cur); - ggml_set_name(cur, "attn_in_proj"); - - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - ggml_set_name(Qcur, "Qcur"); - - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - ggml_set_name(Vcur, "Vcur"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Kcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - ggml_set_name(K, "K"); - - // [N, N, n_head] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // [N, N, n_head] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // [N, N, n_head] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // [N, n_embd/n_head, n_head] - struct ggml_tensor * V_trans = - ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_cpy(ctx0, - Vcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 1, 2, 0, 3)); - ggml_set_name(V_trans, "V_trans"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // [n_embd/n_head, n_head, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // [n_embd, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // cur = proj_w*cur - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_out_proj"); - } - - // residual connection - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward - { - // norm - { - cur = ggml_norm(ctx0, inpFF, EPS_NORM); - ggml_set_name(cur, "norm_1"); - - // cur = ln_2_g*cur + ln_2_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_2_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_2_b, cur)); - ggml_set_name(cur, "ffn_norm"); - } - - // cur = fc_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); - ggml_set_name(cur, "ffn_fc"); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "ffn_gelu"); - - // cur = proj_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); - ggml_set_name(cur, "ffn_out_proj"); - } - - cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_outFF"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_norm(ctx0, cur, EPS_NORM); - ggml_set_name(cur, "norm_final"); - - // cur = ln_f_g*cur + ln_f_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->ln_f_g, cur), - cur), - ggml_repeat(ctx0, model->ln_f_b, cur)); - ggml_set_name(cur, "result_norm"); - } - - // cur = WTE * cur - struct ggml_tensor * lm_head = model->lm_heads[codebook_ix - n_codes_given]; - cur = ggml_mul_mat(ctx0, lm_head, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - return gf; -} - -int fine_gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int n_threads, - int codebook_ix) { - // tokens: [n_channels, seq_length], sequences are contiguous - int64_t t_predict_start_us = ggml_time_us(); - - const int N = n_tokens/8; - const int n_channels = 8; - - const auto & hparams = model->hparams; - - const int n_vocab = hparams.n_out_vocab; - - GGML_ASSERT((N > 1) && (n_channels == 8)); - GGML_ASSERT(n_threads > 0); - - static size_t buf_size = 256u*1024*1024; - static void * buf = 
malloc(buf_size); - - if (model->mem_per_token > 0 && model->mem_per_token*n_tokens > buf_size) { - const size_t buf_size_new = 1.2*(model->mem_per_token*n_tokens); // add 20% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph * gf = bark_build_fine_gpt_graph(ctx0, model, tokens, n_tokens, codebook_ix); - - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - - // run the computation - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - if (logits != NULL) { - // [N, n_vocab] - // [1024, 1056] - memcpy(logits, (float *) ggml_get_data(res), sizeof(float)*N*n_vocab); - } - - if (model->mem_per_token == 0) { - model->mem_per_token = ggml_used_mem(ctx0)/n_tokens; - } - - ggml_free(ctx0); - - int64_t t_predict_end_us = ggml_time_us(); - model->t_predict_us += (t_predict_end_us - t_predict_start_us); - model->n_predict += 1; - - return 0; -} - -bool gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int * n_past, - bool merge_ctx, - int n_threads) { - BARK_ASSERT(n_past != NULL); - - int64_t t_predict_start_us = ggml_time_us(); - - int N = n_tokens; - - const auto & hparams = model->hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.block_size; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_out_vocab; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - if (model->mem_per_token > 0 && model->mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.2*(model->mem_per_token*N); // add 20% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(input->data, tokens, N*ggml_element_size(input)); - - struct ggml_tensor * tok_emb; - - if (*n_past > 0) { - BARK_ASSERT(N == 1); - tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); - } else { - if (merge_ctx) { - BARK_ASSERT(N == 256+256+1); - N -= 256; - } else { - BARK_ASSERT(N <= n_ctx); - } - - if (merge_ctx) { - struct ggml_tensor * seq_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 0)); - struct ggml_tensor * ctx_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 256*ggml_element_size(input))); - struct ggml_tensor * rem_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 1, 512*ggml_element_size(input))); - - struct ggml_tensor * cat_emb = ggml_add(ctx0, seq_embd, ctx_embd); - - tok_emb = ggml_new_tensor_2d(ctx0, cat_emb->type, cat_emb->ne[0], cat_emb->ne[1]+rem_embd->ne[1]); - tok_emb = ggml_set_1d(ctx0, 
tok_emb, cat_emb, 0); - tok_emb = ggml_set_1d(ctx0, tok_emb, rem_embd, cat_emb->ne[0]*cat_emb->ne[1]*ggml_element_size(cat_emb)); - } else { - tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); - } - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = *n_past + i; - } - - // wte + wpe - struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, ggml_get_rows(ctx0, model->wpe, position)); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, EPS_NORM); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_1_b, cur)); - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - { - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_attn_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_attn_attn_b, cur), - cur); - } - - // self-attention - { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model->memory_k, N*n_embd, (ggml_element_size(model->memory_k)*n_embd)*(il*n_ctx + *n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model->memory_v, N*n_embd, (ggml_element_size(model->memory_v)*n_embd)*(il*n_ctx + *n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model->memory_k, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_k)*n_embd), - n_embd/n_head, n_head, *n_past + N), - 0, 2, 1, 3); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, *n_past); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - 
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model->memory_v, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_v)*n_embd), - n_embd/n_head, n_head, *n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model->memory_v->type, *n_past + N, n_embd/n_head, n_head)); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_attn_proj_b, cur), - cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - // norm - { - cur = ggml_norm(ctx0, inpFF, EPS_NORM); - - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_2_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_2_b, cur)); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - cur = ggml_mul_mat(ctx0, - model->layers[il].c_mlp_fc_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_mlp_fc_b, cur), - cur); - - // GELU activation - // [3072, N] - cur = ggml_gelu(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - cur = ggml_mul_mat(ctx0, - model->layers[il].c_mlp_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_mlp_proj_b, cur), - cur); - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, EPS_NORM); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model->ln_f_b, inpL)); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model->lm_heads[0], inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - if (logits != NULL) { - // return result just for the last token - memcpy(logits, (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - } - - if (model->mem_per_token == 0) { - model->mem_per_token = ggml_used_mem(ctx0)/N; - } - - // updating n_past with N (-256 if merge_ctx) - if (n_past) - *n_past += N; - - ggml_free(ctx0); - - model->t_predict_us += (ggml_time_us() - 
t_predict_start_us); - model->n_predict += 1; - - return 0; -} - -void softmax(std::vector & logits) { - // for numerical stability - float maxl = -INFINITY; - for (const auto & l : logits) - maxl = std::max(maxl, l); - - // softmax - float sum = 0.0; - for (auto & l : logits) { - l = exp(l - maxl); - sum += l; - } - - for (auto & l : logits) - l /= sum; -} - -bark_token gpt_multinomial_sample( - std::vector & logits, - std::mt19937 & rng, - float temp, - float * eos_p) { - int n_logits = logits.size(); - - for (int i = 0; i < n_logits; ++i) - logits[i] /= temp; - - softmax(logits); - - std::discrete_distribution dist(logits.begin(), logits.end()); - int next = dist(rng); - - // likelihood of EOS token - if (eos_p) - *eos_p = logits[logits.size() - 1]; - - return next; -} - -bark_token gpt_argmax_sample(std::vector & logits, float * eos_p) { - int n_logits = logits.size(); - - // testing purposes - for (auto & l : logits) { l /= 0.7f; } - - // likelihood of EOS token - softmax(logits); - - if (eos_p) - *eos_p = logits[logits.size() - 1]; - - int next = 0; - float maxl = -INFINITY; - - for (int i = 0; i < n_logits; i++) { - if (logits[i] > maxl) { - maxl = logits[i]; - next = i; - } - } - - return next; -} - -bark_token gpt_sample( - std::vector & logits, - std::mt19937 & rng, - float temp, - float * eos_p, - int64_t * t_sample_us, - int64_t * n_sample) { - int64_t t_sample_start_us = ggml_time_us(); - - bark_token res; - if (temp == 0.0f) { - res = gpt_argmax_sample(logits, eos_p); - } else { - res = gpt_multinomial_sample(logits, rng, temp, eos_p); - } - - int64_t t_sample_end_us = ggml_time_us(); - *t_sample_us += (t_sample_end_us - t_sample_start_us); - *n_sample += 1; - - return res; -} - -void bark_tokenize_input(struct bark_context * ctx, const char * text) { - auto & model = ctx->model.text_model; - bark_vocab * vocab = &ctx->model.vocab; - - int32_t block_size = model.hparams.block_size; - int32_t max_ctx_size = std::min(block_size, 256); - int32_t n_tokens; - - bark_sequence tokens(max_ctx_size); - bert_tokenize(vocab, text, tokens.data(), &n_tokens, max_ctx_size); - - for (int i = 0; i < (int) tokens.size(); i++) - tokens[i] += TEXT_ENCODING_OFFSET; - - if (n_tokens < max_ctx_size) { - for (int i = n_tokens; i < max_ctx_size; i++) - tokens[i] = TEXT_PAD_TOKEN; - } else if (n_tokens > max_ctx_size) { - fprintf(stderr, "%s: input sequence is too long (%d > 256), truncating sequence", __func__, n_tokens); - } - - tokens.resize(max_ctx_size); - - // semantic history - for (int i = 0; i < 256; i++) - tokens.push_back(SEMANTIC_PAD_TOKEN); - tokens.push_back(SEMANTIC_INFER_TOKEN); - - assert(tokens.size() == 256 + 256 + 1); - - ctx->tokens = tokens; - - printf("%s: prompt: '%s'\n", __func__, text); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, ctx->tokens.size()); - for (int i = 0; i < std::min(8, (int) ctx->tokens.size()); i++) { - printf("%d ", ctx->tokens[i]); - } - printf("\n"); -} - -static void bark_print_statistics(gpt_model * model) { - printf("\n\n"); - printf("%s: mem per token = %8.2f MB\n", __func__, model->mem_per_token/1000.0f/1000.0f); - printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us/1000.0f, model->n_sample); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, model->t_predict_us/1000.0f, model->t_predict_us/model->n_predict/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, model->t_main_us/1000.0f); - printf("\n"); -} - -void bark_forward_text_encoder(struct bark_context * 
ctx, int n_threads) { - const int64_t t_main_start_us = ggml_time_us(); - - bark_sequence out; - - bark_progress progress( __func__); - - gpt_model * model = &ctx->model.text_model; - - auto & hparams = model->hparams; - const int n_vocab = hparams.n_out_vocab; - - float min_eos_p = ctx->min_eos_p; - float temp = ctx->temp; - - bark_sequence input = ctx->tokens; - - std::vector logits; - logits.resize(n_vocab); - - float eos_p = 0; - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - int n_past = 0; - - for (int i = 0; i < 768; i++) { - gpt_eval(model, input.data(), input.size(), logits.data(), &n_past, true, n_threads); - - std::vector relevant_logits(logits.begin(), logits.begin() + SEMANTIC_VOCAB_SIZE); - relevant_logits.push_back(logits[SEMANTIC_PAD_TOKEN]); - - input.clear(); - - bark_token next = gpt_sample( - logits, ctx->rng, temp, &eos_p, &model->t_sample_us, &model->n_sample); - - if (next == SEMANTIC_VOCAB_SIZE || eos_p >= min_eos_p) - break; - - input.push_back(next); - out.push_back(next); - - progress.callback((float) i/768); - } - - ctx->semantic_tokens = out; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); -} - -void bark_forward_coarse_encoder(struct bark_context * ctx, int n_threads) { - const int64_t t_main_start_us = ggml_time_us(); - - bark_codes out_coarse; - bark_sequence out; - - bark_progress progress(__func__); - - int max_coarse_history = ctx->max_coarse_history; - int sliding_window_size = ctx->sliding_window_size; - float temp = ctx->temp; - - float semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS; - int max_semantic_history = floorf(max_coarse_history / semantic_to_coarse_ratio); - - int n_steps = floorf(ctx->semantic_tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; - int step_ix = 0; - - BARK_ASSERT(n_steps > 0); - BARK_ASSERT(n_steps % N_COARSE_CODEBOOKS == 0); - - int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); - - gpt_model * model = &ctx->model.coarse_model; - - auto & hparams = model->hparams; - const int n_vocab = hparams.n_out_vocab; - - bark_sequence input = ctx->semantic_tokens; - - std::vector logits; - logits.resize(n_vocab); - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - for (int i = 0; i < n_window_steps; i++) { - int semantic_ix = roundf(n_steps / semantic_to_coarse_ratio); - - bark_sequence input_in( - input.begin() + std::max(semantic_ix-max_semantic_history, 0), - input.end() - ); - size_t original_size = input_in.size(); - input_in.resize(256); - - // padding from the right side - for (int ix = original_size; ix < 256; ix++) - input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN; - - input_in.push_back(COARSE_INFER_TOKEN); - - // concatenate input_in and input_coarse - input_in.insert( - input_in.end(), - std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), - std::make_move_iterator(out.end()) - ); - - int n_past = 0; - // TODO: this is a hack, - model->mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well - - for (int j = 0; j < sliding_window_size; j++) { - if (step_ix >= n_steps) - continue; - - gpt_eval(model, input_in.data(), input_in.size(), logits.data(), 
&n_past, false, n_threads); - - input_in.clear(); - - bool is_major = step_ix % N_COARSE_CODEBOOKS == 0; - int start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; - int end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; - std::vector relevant_logits(logits.begin() + start_ix, logits.begin() + end_ix); - - bark_token next = gpt_sample( - relevant_logits, ctx->rng, temp, NULL, &model->t_sample_us, &model->n_sample); - - next += start_ix; - - input_in.push_back(next); - out.push_back(next); - - step_ix += 1; - - progress.callback((float) (i*sliding_window_size+j)/n_steps); - } - } - - BARK_ASSERT((int) out.size() == n_steps); - BARK_ASSERT(out.size() % N_COARSE_CODEBOOKS == 0); - - // out_coarse: [seq_length, n_codes] - for (int i = 0; i < (int) out.size(); i += N_COARSE_CODEBOOKS) { - // this assumes N_COARSE_CODEBOOKS = 2 - bark_sequence _tmp = { - out[i] - SEMANTIC_VOCAB_SIZE, - out[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE - }; - out_coarse.push_back(_tmp); - } - - ctx->coarse_tokens = out_coarse; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); - -} - -void bark_forward_fine_encoder(struct bark_context * ctx, int n_threads) { - // input shape: [N, n_codes] - const int64_t t_main_start_us = ggml_time_us(); - - bark_progress progress(__func__); - - bark_codes input = ctx->coarse_tokens; - - float temp = ctx->fine_temp; - - std::vector logits; - logits.resize(1024*1056); - - gpt_model * model = &ctx->model.fine_model; - - int n_coarse = input[0].size(); - int original_seq_len = input.size(); - int n_remove_from_end = 0; - - // channel padding - for (int i = 0; i < (int) input.size(); i++) { - for (int j = N_COARSE_CODEBOOKS; j < N_FINE_CODEBOOKS; j++) { - input[i].push_back(CODEBOOK_SIZE); - } - } - - // spatial padding if sequence is too short - if (original_seq_len < 1024) { - n_remove_from_end = 1024 - original_seq_len; - for (int i = original_seq_len; i < 1024; i++) { - bark_sequence _tmp(N_FINE_CODEBOOKS, CODEBOOK_SIZE); - input.push_back(_tmp); - } - } - - // dry run to estimate mem_per_token - bark_token decoy[16] = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - fine_gpt_eval(model, decoy, 16, nullptr, n_threads, 2); - - int n_loops = std::max(0, (int) ceilf((input.size() - 1024)/512.f)) + 1; - - // in_arr: [seq_length, n_codes] - bark_codes in_arr = input; - - for (int n = 0; n < n_loops; n++) { - int start_ix = std::min(n * 512, (int) in_arr.size() - 1024); - int start_fill_ix = std::min(n * 512, (int) in_arr.size() - 512); - int rel_start_fill_ix = start_fill_ix - start_ix; - - // in_buffer: [n_codes*seq_length] (sequences are contiguous) - bark_sequence in_buffer; - for (int i = 0; i < N_FINE_CODEBOOKS; i++) { - for (int j = start_ix; j < start_ix + 1024; j++) { - in_buffer.push_back(in_arr[j][i]); - } - } - - for (int nn = n_coarse; nn < N_FINE_CODEBOOKS; nn++) { - fine_gpt_eval(model, in_buffer.data(), in_buffer.size(), logits.data(), n_threads, nn); - - for (int i = 0; i < 1024; i++) { - std::vector relevant_logits(logits.begin() + i*1056, logits.begin() + (i+1)*1056); - relevant_logits.resize(CODEBOOK_SIZE); - - bark_token next = gpt_sample( - relevant_logits, ctx->rng, temp, NULL, &model->t_sample_us, &model->n_sample); - - in_buffer[nn*1024 + rel_start_fill_ix + i] = next; - } - - progress.callback((float) (n*(N_FINE_CODEBOOKS-n_coarse)+(nn-n_coarse))/(n_loops*(N_FINE_CODEBOOKS-n_coarse))); - } - - // transfer over info into model_in - for (int nn = 
n_coarse; nn < N_FINE_CODEBOOKS; nn++) { - for (int j = 0; j < CODEBOOK_SIZE - rel_start_fill_ix; j++) { - in_arr[start_fill_ix+j][nn] = in_buffer[nn*1024 + rel_start_fill_ix + j]; - } - } - - } - - if (n_remove_from_end > 0) { - in_arr.resize(in_arr.size() - n_remove_from_end); - } - - BARK_ASSERT(ctx->coarse_tokens.size() == in_arr.size()); - - ctx->fine_tokens = in_arr; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); -} - -int encodec_eval( - const bark_codes & tokens, - encodec_model & model, - audio_arr_t & audio_arr) { - // input shape: [seq_length, n_codes] - int64_t t_predict_start_us = ggml_time_us(); - - const int N = tokens.size(); - const int n_codes = tokens[0].size(); - - bark_codes input = tokens; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - if (model.mem_per_token > 0 && model.mem_per_token*N*n_codes > buf_size) { - const size_t buf_size_new = 1.1*(model.mem_per_token*N*n_codes); // add 10% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_codes); - for (int c = 0; c < n_codes; c++) { - bark_sequence _tmp; - for (int i = 0; i < N; i++) - _tmp.push_back(input[i][c]); - int offset = ggml_element_size(codes)*c*N; - memcpy((void *) ((char *) codes->data + offset), _tmp.data(), N*ggml_element_size(codes)); - } - - struct ggml_tensor * quantized_out = encodec_quantizer_decode_eval(ctx0, model, codes); - struct ggml_tensor * output = encodec_decoder_eval(ctx0, model, quantized_out); - - ggml_build_forward_expand(&gf, output); - // TODO: adapt ggml_conv_1d and ggml_conv_trans_1d implementation to use multiple - // threads. 
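As a side note on `encodec_eval` above: the `[seq_length, n_codes]` token matrix is copied into the `codes` tensor codebook-major, one contiguous row of `N` values per codebook. A minimal standalone sketch of that layout, with toy values only:

```cpp
// Sketch of the codebook-major flattening used when filling the codes tensor.
// Toy data; the real copy goes through memcpy into a ggml I32 tensor.
#include <cstdio>
#include <vector>

int main() {
    const int N = 4;        // seq_length
    const int n_codes = 2;  // number of codebooks

    // tokens[i][c]: code for position i in codebook c
    std::vector<std::vector<int>> tokens = {
        {11, 21}, {12, 22}, {13, 23}, {14, 24},
    };

    // flat[c*N + i]: row c holds the whole sequence for codebook c
    std::vector<int> flat(N * n_codes);
    for (int c = 0; c < n_codes; ++c)
        for (int i = 0; i < N; ++i)
            flat[c * N + i] = tokens[i][c];

    for (int k = 0; k < N * n_codes; ++k)
        printf("%d ", flat[k]);   // prints: 11 12 13 14 21 22 23 24
    printf("\n");
    return 0;
}
```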
- ggml_graph_compute_with_ctx(ctx0, &gf, 1); - - int out_seq_length = output->ne[0]; - audio_arr.resize(out_seq_length); - memcpy(audio_arr.data(), (float *) ggml_get_data(output), sizeof(float)*out_seq_length); - - if (model.mem_per_token == 0) { - model.mem_per_token = ggml_used_mem(ctx0)/N/n_codes; - } - - ggml_free(ctx0); - - model.t_predict_us += (ggml_time_us() - t_predict_start_us); - - return 0; -} - -void bark_forward_encodec(struct bark_context * ctx) { - const int64_t t_main_start_us = ggml_time_us(); - - auto & model = ctx->model.codec_model; - - // dry run to estimate mem_per_token - bark_codes toy_data; - for (int i = 0; i < 20; i++) { - bark_sequence _tmp(4, i); - toy_data.push_back(_tmp); - } - encodec_eval(toy_data, model, ctx->audio_arr); - - // actual run - encodec_eval(ctx->fine_tokens, model, ctx->audio_arr); - - const int64_t t_main_end_us = ggml_time_us(); - model.t_main_us = t_main_end_us - t_main_start_us; - - printf("\n\n"); - printf("%s: mem per token = %zu bytes\n", __func__, model.mem_per_token); - printf("%s: predict time = %8.2f ms\n", __func__, model.t_predict_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, model.t_main_us/1000.0f); - printf("\n"); -} - -int write_wav_on_disk(audio_arr_t& audio_arr, std::string dest_path) { - drwav_data_format format; - format.container = drwav_container_riff; - format.format = DR_WAVE_FORMAT_IEEE_FLOAT; - format.channels = 1; - format.sampleRate = SAMPLE_RATE; - format.bitsPerSample = 32; - - drwav wav; - drwav_init_file_write(&wav, dest_path.c_str(), &format, NULL); - drwav_uint64 frames = drwav_write_pcm_frames(&wav, audio_arr.size(), audio_arr.data()); - drwav_uninit(&wav); - - fprintf(stderr, "Number of frames written = %lld.\n", frames); - - return 0; -} - -int bark_generate_audio( - struct bark_context * ctx, - const char * text, - const char * dest_wav_path, - int n_threads) { - bark_tokenize_input(ctx, text); - - bark_forward_text_encoder (ctx, n_threads); - bark_forward_coarse_encoder(ctx, n_threads); - bark_forward_fine_encoder (ctx, n_threads); - - bark_forward_encodec(ctx); - - write_wav_on_disk(ctx->audio_arr, dest_wav_path); - - return 0; -} - - -void bark_free_model(struct bark_model * model) { - delete model; -} - -void bark_free(bark_context * ctx) { - ggml_free(ctx->model.coarse_model.ctx); - ggml_free(ctx->model.fine_model.ctx); - ggml_free(ctx->model.text_model.ctx); - ggml_free(ctx->model.codec_model.ctx); - - delete ctx; -} diff --git a/bark.h b/bark.h deleted file mode 100644 index 0e3a7a5..0000000 --- a/bark.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef BARK_H -#define BARK_H - -#include "encodec.h" - -#include -#include -#include -#include - -#ifdef BARK_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef BARK_BUILD -# define BARK_API __declspec(dllexport) -# else -# define BARK_API __declspec(dllimport) -# endif -# else -# define BARK_API __attribute__ ((visibility ("default"))) -# endif -#else -# define BARK_API -#endif - -#define SAMPLE_RATE 24000 - -#define CLS_TOKEN_ID 101 -#define SEP_TOKEN_ID 102 - -#define TEXT_ENCODING_OFFSET 10048 -#define TEXT_PAD_TOKEN 129595 - -#define CODEBOOK_SIZE 1024 -#define N_COARSE_CODEBOOKS 2 -#define N_FINE_CODEBOOKS 8 - -#define SEMANTIC_PAD_TOKEN 10000 -#define SEMANTIC_INFER_TOKEN 129599 -#define SEMANTIC_VOCAB_SIZE 10000 -#define SEMANTIC_RATE_HZ 49.9 - -#define COARSE_RATE_HZ 75 -#define COARSE_SEMANTIC_PAD_TOKEN 12048 -#define COARSE_INFER_TOKEN 12050 - - -#ifdef __cplusplus -extern "C" { -#endif - - // - // C interface - // - - 
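The deleted `bark.h` below exposed a small C API around the full pipeline (`bark_generate_audio` chains tokenization, the three GPT encoders, EnCodec decoding and WAV output). A minimal caller against that old API looked roughly like this; the weights directory, prompt and thread count are placeholders:

```cpp
// Hypothetical caller of the pre-refactor API declared in this header.
#include "bark.h"

int main() {
    struct bark_model * model = bark_load_model_from_file("./ggml_weights");
    if (!model) return 1;

    struct bark_context_params params = bark_context_default_params();
    struct bark_context * ctx = bark_new_context_with_model(model, params);

    // text prompt in, 24 kHz mono WAV out
    bark_generate_audio(ctx, "this is an audio", "./output.wav", 4);

    bark_free(ctx);
    bark_free_model(model);
    return 0;
}
```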
typedef int32_t bark_token; - - struct bark_context; - struct bark_progress; - - struct bark_context_params { - uint32_t seed; // RNG seed - - float temp; // Temperature for sampling (text and coarse encoders) - float fine_temp; // Temperature for sampling (fine encoder) - - float min_eos_p; // Minimum probability for EOS token (text encoder) - int sliding_window_size; // Sliding window size for coarse encoder - int max_coarse_history; // Max history for coarse encoder - }; - - struct bark_model; - struct bark_vocab; - - struct gpt_hparams; - struct gpt_layer; - struct gpt_model; - - BARK_API struct bark_context_params bark_context_default_params(void); - - BARK_API struct bark_context * bark_new_context_with_model( - struct bark_model * model, - struct bark_context_params params); - - BARK_API void bark_seed_rng(struct bark_context * ctx, int32_t seed); - - BARK_API void bark_free(struct bark_context * ctx); - - BARK_API void bark_free_model(struct bark_model * ctx); - - BARK_API int bark_generate_audio( - struct bark_context * ctx, - const char * text, - const char * dest_wav_path, - int n_threads); - - BARK_API struct bark_model * bark_load_model_from_file(const char * dirname); - - BARK_API int bark_model_quantize( - const char * fname_inp, - const char * fname_out, - ggml_ftype ftype); - - BARK_API int bark_vocab_load( - const char * fname, - bark_vocab * vocab, - int32_t expected_size); - -#ifdef __cplusplus -} -#endif - -#ifdef BARK_API_INTERNAL - - // - // Internal API for tests - // - - typedef std::vector bark_sequence; - typedef std::vector> bark_codes; - typedef std::vector audio_arr_t; - - int gpt_model_load(const std::string& fname, gpt_model& model); - - int gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int * n_past, - bool merge_ctx, - int n_threads); - - bool fine_gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int n_threads, - int codebook_ix); - - void bert_tokenize( - const bark_vocab * vocab, - const char * text, - int32_t * tokens, - int32_t * n_tokens, - int32_t n_max_tokens); - - void bark_forward_text_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_coarse_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_fine_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_encodec(struct bark_context * ctx); - -#endif // BARK_API_INTERNAL - -#endif // BARK_H diff --git a/CMakeLists.txt b/bark/CMakeLists.txt similarity index 50% rename from CMakeLists.txt rename to bark/CMakeLists.txt index 9e923a7..94d5711 100644 --- a/CMakeLists.txt +++ b/bark/CMakeLists.txt @@ -1,5 +1,5 @@ -cmake_minimum_required(VERSION 3.12) -project("bark.cpp" C CXX) +cmake_minimum_required(VERSION 3.12) +project("bark" C CXX) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) @@ -7,8 +7,7 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_CXX_FLAGS_RELEASE "-O3") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(BARK_STANDALONE ON) @@ -16,34 +15,27 @@ else() set(BARK_STANDALONE OFF) endif() -option(BARK_BUILD_TESTS "bark: build tests" ${BARK_STANDALONE}) -option(BARK_BUILD_EXAMPLES "bark: build examples" ${BARK_STANDALONE}) +option(BARK_BUILD_TESTS "bark: build tests" 
${BARK_STANDALONE}) +option(BARK_BUILD_EXAMPLES "bark: build examples" ${BARK_STANDALONE}) # Build libraries -add_subdirectory(ggml) +set(BARK_LIB bark) -set(BARK_LIB bark.cpp) +# add_subdirectory(../ggml ${CMAKE_BINARY_DIR}/ggml) +add_subdirectory(../encodec.cpp ${CMAKE_BINARY_DIR}/encodec.cpp) -add_library( - ${BARK_LIB} - bark - bark.cpp - bark.h - bark-util.h - encodec.cpp - encodec.h -) - -target_link_libraries(${BARK_LIB} PUBLIC ggml) -target_include_directories(${BARK_LIB} PUBLIC .) -target_compile_features(${BARK_LIB} PUBLIC cxx_std_11) +add_library(${BARK_LIB} bark.cpp bark.h) if (BARK_BUILD_EXAMPLES) add_subdirectory(examples) endif() -if (BARK_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (BARK_BUILD_TESTS) include(CTest) add_subdirectory(tests) endif () + +target_link_libraries(${BARK_LIB} PUBLIC ggml encodec) +target_include_directories(${BARK_LIB} PUBLIC .) +target_compile_features(${BARK_LIB} PUBLIC cxx_std_11) diff --git a/bark/bark.cpp b/bark/bark.cpp new file mode 100644 index 0000000..2ab7fe8 --- /dev/null +++ b/bark/bark.cpp @@ -0,0 +1,2333 @@ +/* Port of Suno's Bark to C/C++. */ +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "bark.h" +#include "encodec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define EPS_NORM 1e-5f + + +void print_tensor(struct ggml_tensor * a) { + float sum = 0; + float maxv = -INFINITY; + float minv = INFINITY; + if (a) { + for (int i = 0; i < a->ne[3]; i++) { + for (int j = 0; j < a->ne[2]; j++) { + for (int k = 0; k < a->ne[1]; k++) { + for (int l = 0; l < a->ne[0]; l++) { + if (a->type == GGML_TYPE_F32) { + float * aval = (float *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + sum += *aval; + maxv = MAX(*aval, maxv); + minv = MIN(*aval, minv); + // printf("%.4f ", *aval); + } else if (a->type == GGML_TYPE_F16) { + ggml_fp16_t * tmp = (ggml_fp16_t *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + float aval = ggml_fp16_to_fp32(*tmp); + sum += aval; + maxv = MAX(aval, maxv); + minv = MIN(aval, minv); + // printf("%.4f ", aval); + } else if (a->type == GGML_TYPE_I32) { + int32_t * aval = (int32_t *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + sum += (float) *aval; + maxv = MAX((float) *aval, maxv); + minv = MIN((float) *aval, minv); + // printf("%d ", *aval); + } else { + throw std::runtime_error("Wrong tensor type."); + } + } + // printf("\n"); + } + // printf("\n\n"); + } + } + printf("sum=%.2f; max=%.2f; min=%.2f\n", sum, maxv, minv); + printf("shape=[%lld, %lld, %lld, %lld]\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + } +} + +class BarkProgressBar { + public: + BarkProgressBar(std::string func_name, double needed_progress) { + this->func_name = func_name; + this->needed_progress = needed_progress; + } + + void update(double new_progress) { + current_progress += new_progress; + amount_of_filler = (int)((current_progress / needed_progress)*(double)pbar_length); + } + void print() { + printf("\r%s: %s", func_name.c_str(), initial_part.c_str()); + for (int a = 0; a < amount_of_filler; a++) { + printf("%s", pbar_filler.c_str()); + } + printf("%s", pbar_updater.c_str()); + for (int b = 0; b < pbar_length - amount_of_filler; b++) { + printf(" "); + } + printf("%s (%d%%)", 
last_part.c_str(), (int)(100*(current_progress/needed_progress))); + fflush(stdout); + } + + std::string initial_part = "[", last_part = "]"; + std::string pbar_filler = "=", pbar_updater = ">"; + + private: + std::string func_name; + double needed_progress, current_progress = 0; + int amount_of_filler, pbar_length = 50; +}; + +template +static void read_safe(std::ifstream& fin, T& dest) { + fin.read((char*)& dest, sizeof(T)); +} + +template +static void write_safe(std::ofstream& fout, T& dest) { + fout.write((char*)& dest, sizeof(T)); +} + +static void bark_print_statistics(gpt_model * model) { + printf("\n\n"); + printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us/1000.0f, model->n_sample); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, model->t_predict_us/1000.0f, model->t_predict_us/model->n_sample/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, model->t_main_us/1000.0f); + printf("\n"); +} + +static void softmax(std::vector & logits) { + // for numerical stability + float maxl = -INFINITY; + for (const auto & l : logits) + maxl = std::max(maxl, l); + + // softmax + float sum = 0.0; + for (auto & l : logits) { + l = exp(l - maxl); + sum += l; + } + + for (auto & l : logits) + l /= sum; +} + +static bark_token gpt_multinomial_sample( + std::vector & logits, + std::mt19937 & rng, + float temp, + float * eos_p) { + int n_logits = logits.size(); + + for (int i = 0; i < n_logits; ++i) + logits[i] /= temp; + + softmax(logits); + + std::discrete_distribution dist(logits.begin(), logits.end()); + int next = dist(rng); + + // likelihood of EOS token + if (eos_p) + *eos_p = logits[logits.size() - 1]; + + return next; +} + +static bark_token gpt_argmax_sample(std::vector & logits, float * eos_p) { + int n_logits = logits.size(); + + // testing purposes + for (auto & l : logits) { l /= 0.7f; } + + // likelihood of EOS token + softmax(logits); + + if (eos_p) + *eos_p = logits[logits.size() - 1]; + + int next = 0; + float maxl = -INFINITY; + + for (int i = 0; i < n_logits; i++) { + if (logits[i] > maxl) { + maxl = logits[i]; + next = i; + } + } + + return next; +} + +static bark_token gpt_sample( + std::vector & logits, + std::mt19937 & rng, + float temp, + float * eos_p, + int64_t * t_sample_us, + int64_t * n_sample) { + int64_t t_sample_start_us = ggml_time_us(); + + bark_token res; + if (temp == 0.0f) { + res = gpt_argmax_sample(logits, eos_p); + } else { + res = gpt_multinomial_sample(logits, rng, temp, eos_p); + } + + int64_t t_sample_end_us = ggml_time_us(); + *t_sample_us += (t_sample_end_us - t_sample_start_us); + *n_sample += 1; + + return res; +} + +bool bark_vocab_load( + const std::string & fname, + bark_vocab * vocab, + int32_t expected_size) { + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + int32_t n_vocab; + read_safe(fin, n_vocab); + + // 5 special tokens: [UNK, SEP, MASK, PAD, CLS] + if (n_vocab != expected_size) { + fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, expected_size); + return false; + } + + std::string word; + std::vector tmp; + + tmp.reserve(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + read_safe(fin, len); + + if 
(len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + word.assign(&tmp[0], tmp.size()); + } else { + word = ""; + } + + vocab->token_to_id[word] = i; + vocab->id_to_token[i] = word; + } + + return true; +} + +static size_t utf8_len(char src) { + const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +static std::string strip_accents(const std::string & in_str) { + std::string out_str; + std::map accent_map = {{"À", 'A'},{"Á", 'A'}, + {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, + {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, + {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, + {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, + {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, + {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, + {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, + {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, + {"Ñ", 'N'},{"ñ", 'n'}, + }; + + for (size_t i = 0; i < in_str.length();) { + int len = utf8_len(in_str[i]); + std::string cur = in_str.substr(i, len); + auto iter = accent_map.find(cur); + if (iter != accent_map.end()) + out_str += iter->second; + else + out_str += cur; + + i += len; + } + + return out_str; +} + +void bert_tokenize( + const bark_vocab * vocab, + const char * text, + int32_t * tokens, + int32_t * n_tokens, + int32_t n_max_tokens) { + std::string str = text; + std::vector words; + + int32_t t = 0; + + auto * token_map = &vocab->token_to_id; + + // split the text into words + { + str = strip_accents(text); + + std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; + + std::regex re(pat); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (std::string x : m) + words.push_back(x); + str = m.suffix(); + } + } + + // apply wordpiece + for (const auto &word : words) { + if (word.size() == 0) + continue; + + std::string prefix = ""; + int i = 0; + int n = word.size(); + + loop: + while (i < n) { + if (t >= n_max_tokens - 1) + break; + int j = n; + while (j > i) { + auto it = token_map->find(prefix + word.substr(i, j - i)); + if (it != token_map->end()) { + tokens[t++] = it->second; + i = j; + prefix = "##"; + goto loop; + } + --j; + } + if (j == i) { + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + prefix = "##"; + ++i; + } + } + } + + *n_tokens = t; +} + +static void bark_tokenize_input(struct bark_context * bctx, const std::string & text) { + auto & model = bctx->model.text_model; + bark_vocab * vocab = &bctx->model.vocab; + + auto & params = bctx->params; + + int32_t block_size = model.hparams.block_size; + int32_t max_ctx_size = std::min(block_size, 256); + int32_t n_tokens; + + bark_sequence tokens(max_ctx_size); + bert_tokenize(vocab, text.data(), tokens.data(), &n_tokens, max_ctx_size); + + for (int i = 0; i < (int) tokens.size(); i++) + tokens[i] += params.text_encoding_offset; + + if (n_tokens < max_ctx_size) { + for (int i = n_tokens; i < max_ctx_size; i++) + tokens[i] = params.text_pad_token; + } else if (n_tokens > max_ctx_size) { + fprintf(stderr, "%s: input sequence is too long (%d > 256), truncating sequence", __func__, n_tokens); + } + + tokens.resize(max_ctx_size); + + // semantic history + for (int i = 0; i < 256; i++) + tokens.push_back(params.semantic_pad_token); + tokens.push_back(params.semantic_infer_token); 
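`bert_tokenize` above applies greedy longest-match WordPiece: at each position it tries the longest vocabulary entry first, and continuation pieces carry a "##" prefix. A self-contained sketch with a toy vocabulary (real IDs come from the model's vocab file):

```cpp
// Minimal greedy longest-match WordPiece sketch, toy vocabulary only.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    std::map<std::string, int> vocab = {
        {"bark", 1}, {"##ing", 2}, {"dog", 3}, {"##s", 4},
    };

    auto wordpiece = [&](const std::string & word, std::vector<int> & out) {
        std::string prefix;
        size_t i = 0;
        while (i < word.size()) {
            size_t j = word.size();
            bool found = false;
            while (j > i) {
                auto it = vocab.find(prefix + word.substr(i, j - i));
                if (it != vocab.end()) { out.push_back(it->second); found = true; break; }
                --j;
            }
            if (found) i = j;     // consume the matched piece
            else       ++i;       // unknown piece: skip one character
            prefix = "##";        // subsequent pieces are continuations
        }
    };

    std::vector<int> tokens;
    wordpiece("barking", tokens);
    wordpiece("dogs", tokens);
    for (int t : tokens) printf("%d ", t);  // prints: 1 2 3 4
    printf("\n");
    return 0;
}
```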
+ + assert(tokens.size() == 256 + 256 + 1); + + bctx->tokens = tokens; + + printf("%s: prompt: '%s'\n", __func__, text.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, bctx->tokens.size()); + for (int i = 0; i < std::min(8, (int) bctx->tokens.size()); i++) { + printf("%d ", bctx->tokens[i]); + } + printf("\n\n"); +} + +static bool gpt_load_model_weights( + const std::string & fname, + gpt_model & model, + int n_gpu_layers, + bark_verbosity_level verbosity) { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str()); + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + read_safe(fin, hparams.n_layer); + read_safe(fin, hparams.n_head); + read_safe(fin, hparams.n_embd); + read_safe(fin, hparams.block_size); + read_safe(fin, hparams.bias); + read_safe(fin, hparams.n_in_vocab); + read_safe(fin, hparams.n_out_vocab); + read_safe(fin, hparams.n_lm_heads); + read_safe(fin, hparams.n_wtes); + read_safe(fin, hparams.ftype); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); + printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); + printf("%s: block_size = %d\n", __func__, hparams.block_size); + printf("%s: bias = %d\n", __func__, hparams.bias); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); + printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + } + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t buffer_size = 0; + size_t n_tensors = 0; + + // Evaluating context size + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + const int n_in_vocab = hparams.n_in_vocab; + const int n_out_vocab = hparams.n_out_vocab; + const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + const int bias = hparams.bias; + + buffer_size += n_embd * ggml_type_size(GGML_TYPE_F32); // ln_f_g + + buffer_size += n_wtes * n_in_vocab * n_embd * ggml_type_size(wtype); // wtes + buffer_size += block_size * n_embd * ggml_type_size(GGML_TYPE_F32); // wpe + buffer_size += n_lm_heads * n_out_vocab * n_embd * ggml_type_size(wtype); 
// lm_head + + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_2_g + + buffer_size += n_layer * (3 * n_embd * n_embd * ggml_type_size(wtype)); // c_attn_attn_w + buffer_size += n_layer * ( n_embd * n_embd * ggml_type_size(wtype)); // c_attn_proj_w + + buffer_size += n_layer * (4 * n_embd * n_embd * ggml_type_size(wtype)); // c_mlp_fc_w + buffer_size += n_layer * (4 * n_embd * n_embd * ggml_type_size(wtype)); // c_mlp_proj_w + + if (bias) { + buffer_size += n_embd * ggml_type_size(GGML_TYPE_F32); // ln_f_b + + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_1_b + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_2_b + + buffer_size += n_layer * (3 * n_embd * ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b + buffer_size += n_layer * ( n_embd * ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b + + buffer_size += n_layer * (4 * n_embd * ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b + buffer_size += n_layer * ( n_embd * ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b + } + + buffer_size += 10ull*MB; // object overhead + + n_tensors = ( + 1 + // ln_f_g + n_wtes + 1 + // wtes, wpe + 2 * n_layer + // ln_1_g, ln_2_g + 2 * n_layer + // c_attn_attn_w, c_attn_proj_w + 2 * n_layer + // c_mlp_fc_w, c_mlp_proj_w + n_lm_heads + // lm_head + 2 // memory_k, memory_v + ); + + if (bias) { + n_tensors += 1; // ln_f_b + n_tensors += 2 * n_layer; // ln_1_b, ln_2_b + n_tensors += 4 * n_layer; // c_attn_attn_b, c_attn_proj_b, c_mlp_fc_b, c_mlp_proj_b + } + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); + } + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + if (verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: no backend specified, using CPU backend\n", __func__); + } + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + if (verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: failed to initialize CPU backend\n", __func__); + } + + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + const int n_in_vocab = hparams.n_in_vocab; + const int n_out_vocab = hparams.n_out_vocab; + 
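The buffer-size estimate above simply sums element counts times `ggml_type_size()` for every weight tensor. A rough stand-in with hypothetical hparams and F16 weights, bias terms omitted, just to give a sense of scale:

```cpp
// Back-of-the-envelope weight-buffer estimate; hparams here are made up,
// the real loader reads them from the model file and uses ggml_type_size().
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t n_embd = 768, n_layer = 12, block_size = 1024;
    const int64_t n_in_vocab = 129600, n_out_vocab = 10048;
    const int64_t n_lm_heads = 1, n_wtes = 1;
    const size_t  f32 = 4, f16 = 2;                    // bytes per element

    size_t sz = 0;
    sz += n_embd * f32;                                // ln_f_g
    sz += n_wtes * n_in_vocab * n_embd * f16;          // token embeddings
    sz += block_size * n_embd * f32;                   // position embeddings
    sz += n_lm_heads * n_out_vocab * n_embd * f16;     // lm_head(s)
    sz += n_layer * (2 * n_embd * f32);                // ln_1_g, ln_2_g
    sz += n_layer * (3 + 1) * n_embd * n_embd * f16;   // attn qkv + proj
    sz += n_layer * (4 + 4) * n_embd * n_embd * f16;   // mlp fc + proj
    sz += 10ull * 1024 * 1024;                         // object overhead

    printf("estimated weight buffer: %.2f MB\n", sz / 1024.0 / 1024.0);
    return 0;
}
```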
const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + const int bias = hparams.bias; + + model.layers.resize(n_layer); + model.lm_heads.resize(n_lm_heads); + model.wtes.resize(n_wtes); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + if (bias) { + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + } + + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, block_size); + + for (int i = 0; i < n_wtes; i++) { + model.wtes[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_in_vocab); + model.tensors["model/wte/" + std::to_string(i)] = model.wtes[i]; + } + + for (int i = 0; i < n_lm_heads; i++) { + model.lm_heads[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_out_vocab); + model.tensors["model/lm_head/" + std::to_string(i)] = model.lm_heads[i]; + } + + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wpe"] = model.wpe; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + + if (bias) { + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + } + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + + const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + + const int n_mem = n_layer*block_size; + const int n_elements = n_embd*n_mem; + + if (n_lm_heads == 1 && n_wtes == 1) { + // hack: if one LM head and one token embedding layer, we are loading weights + // of the text and coarse encoder. In this case, we need KV cache. + // for fine encoder, no need for KV cache, skip this part. 
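For scale, the KV cache allocated below (only for the text and coarse models, which have a single LM head and a single embedding table) is two F32 tensors of `n_layer * block_size * n_embd` elements each. A back-of-the-envelope computation with hypothetical hparams:

```cpp
// Rough KV-cache sizing for the branch below; hparams are illustrative only.
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t n_embd = 768, n_layer = 12, block_size = 1024;

    const int64_t n_mem       = n_layer * block_size;  // cached positions across all layers
    const int64_t n_elements  = n_embd * n_mem;        // per K (and per V) tensor
    const size_t  memory_size = 2 * n_elements * sizeof(float);

    printf("KV cache: %.2f MB for %lld cached positions\n",
           memory_size / 1024.0 / 1024.0, (long long) n_mem);
    return 0;
}
```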
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // create a backend buffer (can be in host or device memory) + model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + + // allocate the tensors into the backend buffer + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + + // this updates the pointers in the tensors to point to the correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + + ggml_allocr_free(alloc); + } + } + } + + // load weights + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + + size_t total_size = 0; + + std::vector read_buf; + + while(true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ttype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + ggml_allocr_alloc(alloc, tensor); + + if (ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + // for the CPU and Metal backends, we can read directly into the device memory + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%48s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], "float", ggml_nbytes(tensor)/1024.0/1024.0); + } + + total_size += ggml_nbytes(tensor); + } + + ggml_allocr_free(alloc); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + model.memsize = 
total_size; + } + + fin.close(); + + return true; +} + +static bool ggml_quantize_weights( + std::ifstream & fin, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip) { + ggml_type qtype = GGML_TYPE_F32; + + switch (ftype) { + case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: + case GGML_FTYPE_ALL_F32: + case GGML_FTYPE_MOSTLY_F16: + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GGML_FTYPE_MOSTLY_Q2_K: + case GGML_FTYPE_MOSTLY_Q3_K: + case GGML_FTYPE_MOSTLY_Q4_K: + case GGML_FTYPE_MOSTLY_Q5_K: + case GGML_FTYPE_MOSTLY_Q6_K: + { + fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); + return false; + } + }; + + if (!ggml_is_quantized(qtype)) { + fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); + return false; + } + + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector work; + + std::vector data_u8; + std::vector data_f16; + std::vector data_f32; + + std::vector hist_all(1 << 4, 0); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ttype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); + + bool quantize = false; + + // check if we should quantize this tensor + for (const auto & s : to_quant) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // check if we should skip this tensor + for (const auto & s : to_skip) { + if (std::regex_match(name, std::regex(s))) { + quantize = false; + break; + } + } + + // quantize only 2D tensors + quantize &= (n_dims == 2); + + if (quantize) { + if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { + fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + + if (ttype == GGML_TYPE_F16) { + data_f16.resize(nelements); + fin.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); + data_f32.resize(nelements); + for (int i = 0; i < nelements; ++i) { + data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); + } + } else { + data_f32.resize(nelements); + fin.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + } + + ttype = qtype; + } else { + const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); + + data_u8.resize(nelements*bpe); + fin.read(reinterpret_cast(data_u8.data()), nelements * bpe); + } + + write_safe(fout, n_dims); + write_safe(fout, length); + write_safe(fout, ttype); + for (int i = 0; i < n_dims; ++i) { + write_safe(fout, ne[i]); + } + fout.write(&name[0], length); + + if (quantize) { + work.resize(nelements); // for quantization + + size_t cur_size = 0; + std::vector hist_cur(1 << 4, 0); + + switch ((ggml_type) ttype) { + case GGML_TYPE_Q4_0: + { + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: + { + cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: + { + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: + { + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_COUNT: + { + fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + } + + fout.write(reinterpret_cast(work.data()), cur_size); + total_size_new += cur_size; + + printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); + for (int i = 0; i < (int) hist_cur.size(); ++i) { + hist_all[i] += hist_cur[i]; + } + + for (int i = 0; i < (int) hist_cur.size(); ++i) { + printf("%5.3f ", hist_cur[i] / (float)nelements); + } + printf("\n"); + } else { + printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); + fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); + total_size_new += data_u8.size(); + } + + total_size_org += nelements * sizeof(float); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); + + { + int64_t sum_all = 0; + for (int i = 0; i < (int) hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (int i = 0; i < (int) hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} + +static struct ggml_cgraph * bark_build_gpt_graph( + gpt_model * model, + ggml_allocr * allocr, + bark_sequence & tokens, + int * n_past, + bool merge_ctx, + int n_threads) { + if (!n_past) { + fprintf(stderr, "%s: n_past is null\n", __func__); + return NULL; + } + + int N = tokens.size(); + + const auto & hparams = model->hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.block_size; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_out_vocab; + const int bias = hparams.bias; + + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + 
/*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, input); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(input, tokens.data(), 0, N*ggml_element_size(input)); + } + + struct ggml_tensor * tok_emb; + + if (*n_past > 0) { + assert(N == 1); + tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); + } else { + if (merge_ctx) { + assert(N == 256+256+1); + N -= 256; + } else { + assert(N <= n_ctx); + } + + if (merge_ctx) { + struct ggml_tensor * seq_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 0)); + struct ggml_tensor * ctx_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 256*ggml_element_size(input))); + struct ggml_tensor * rem_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 1, 512*ggml_element_size(input))); + + struct ggml_tensor * cat_emb = ggml_add(ctx0, seq_embd, ctx_embd); + + tok_emb = ggml_new_tensor_2d(ctx0, cat_emb->type, cat_emb->ne[0], cat_emb->ne[1]+rem_embd->ne[1]); + ggml_allocr_alloc(allocr, tok_emb); + + tok_emb = ggml_set_1d(ctx0, tok_emb, cat_emb, 0); + tok_emb = ggml_set_1d(ctx0, tok_emb, rem_embd, cat_emb->ne[0]*cat_emb->ne[1]*ggml_element_size(cat_emb)); + } else { + tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); + } + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + int32_t v = *n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // wte + wpe + struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, ggml_get_rows(ctx0, model->wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL, EPS_NORM); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_1_g); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].ln_1_b); + } + } + + // attn + { + cur = ggml_mul_mat(ctx0, + model->layers[il].c_attn_attn_w, + cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_attn_attn_b); + } + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model->memory_k, N*n_embd, (ggml_element_size(model->memory_k)*n_embd)*(il*n_ctx + *n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model->memory_v, N*n_embd, (ggml_element_size(model->memory_v)*n_embd)*(il*n_ctx + *n_past)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model->memory_k, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_k)*n_embd), + n_embd/n_head, n_head, *n_past + N), + 0, 2, 1, 3); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, *n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model->memory_v, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_v)*n_embd), + n_embd/n_head, n_head, *n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model->memory_v->type, *n_past + N, n_embd/n_head, n_head)); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + { + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_proj_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_attn_proj_b); + } + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, EPS_NORM); + + // cur = ln_2_g*cur + ln_2_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_2_g); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].ln_2_b); + } + } + + // cur = fc_w*cur + fc_b + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_mlp_fc_b); + } + + cur = ggml_gelu(ctx0, cur); + + // projection + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_mlp_proj_b); + } + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, EPS_NORM); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_mul(ctx0, inpL, model->ln_f_g); + + if (bias) { + inpL = ggml_add(ctx0, inpL, model->ln_f_b); + } + } + + inpL = ggml_mul_mat(ctx0, + model->lm_heads[0], + ggml_view_1d(ctx0, inpL, inpL->ne[0], (inpL->ne[1]-1)*inpL->nb[1])); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +static ggml_cgraph * bark_build_fine_gpt_graph( + gpt_model * model, + ggml_allocr * allocr, + bark_sequence & tokens, + int codebook_idx, + int n_fine_codebooks, + int n_threads) { + // tokens: [n_channels, N] + const int N = tokens.size() / n_fine_codebooks; + const int n_channels = n_fine_codebooks; + + const auto & hparams = model->hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.block_size; + const int n_head = hparams.n_head; + + const int n_codes_given = hparams.n_codes_given; + + assert(N <= n_ctx); + assert(codebook_idx > 0); + + static size_t buf_size = ggml_tensor_overhead() * GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + + struct ggml_cgraph * 
gf = ggml_new_graph(ctx0); + + struct ggml_tensor * input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_channels); + ggml_allocr_alloc(allocr, input); + + struct ggml_tensor * tok_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + ggml_allocr_alloc(allocr, tok_emb); + + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(input, tokens.data(), 0, N*n_channels*ggml_element_size(input)); + ggml_set_zero(tok_emb); + } + + for (int wte_ix = 0; wte_ix < codebook_idx + 1; wte_ix++) { + struct ggml_tensor * cur = ggml_get_rows(ctx0, + model->wtes[wte_ix], + ggml_view_1d(ctx0, input, N, wte_ix*input->nb[1])); + + tok_emb = ggml_add(ctx0, tok_emb, cur); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ggml_backend_tensor_set(position, &i, i*sizeof(int32_t), sizeof(int32_t)); + } + } + + struct ggml_tensor * pos_emb = ggml_get_rows(ctx0, model->wpe, position); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // wte + wpe + struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, pos_emb); + + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL, EPS_NORM); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_1_g); + cur = ggml_add(ctx0, cur, model->layers[il].ln_1_b); + } + + // self-attention + { + // cur = attn_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_attn_w, cur); + + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); + + struct ggml_tensor * V_trans = + ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_cpy(ctx0, + Vcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 1, 2, 0, 3)); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // [n_embd, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // cur = proj_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_proj_w, cur); + } + + // residual connection + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward + { + // norm + { + cur = ggml_norm(ctx0, inpFF, EPS_NORM); + + cur = ggml_mul(ctx0, cur, model->layers[il].ln_2_g); + cur = ggml_add(ctx0, cur, model->layers[il].ln_2_b); + } + + // cur = fc_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); + + // GELU activation + 
cur = ggml_gelu(ctx0, cur); + + // cur = proj_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, EPS_NORM); + + inpL = ggml_mul(ctx0, inpL, model->ln_f_g); + inpL = ggml_add(ctx0, inpL, model->ln_f_b); + } + + // inpL = WTE * inpL + struct ggml_tensor * lm_head = model->lm_heads[codebook_idx - n_codes_given]; + inpL = ggml_mul_mat(ctx0, lm_head, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +static bool bark_eval_encoder_internal( + gpt_model & model, + ggml_allocr * allocr, + bark_sequence & input, + std::vector & logits, + int * n_past, + bool merge_ctx, + int n_threads) { + auto & hparams = model.hparams; + const int n_vocab = hparams.n_out_vocab; + + const int64_t t_predict_us_start = ggml_time_us(); + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, input, n_past, merge_ctx, n_threads); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + int N = input.size(); + if (merge_ctx && *n_past == 0) { + N -= 256; + } + + logits.resize(n_vocab); + ggml_backend_tensor_get(inpL, logits.data(), 0, sizeof(float)*n_vocab); + + // updating n_past with N (-256 if merge_ctx) + if (n_past) { + *n_past += N; + } + + model.t_predict_us += ggml_time_us() - t_predict_us_start; + + return true; + +} + +static bool bark_eval_fine_encoder_internal( + struct bark_context * bctx, + bark_sequence & input, + std::vector & logits, + int n_threads, + int nn) { + auto & model = bctx->model.fine_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + auto & params = bctx->params; + + const int n_vocab = hparams.n_out_vocab; + const int block_size = hparams.block_size; + + const int n_fine_codebooks = params.n_fine_codebooks; + + const int64_t t_predict_us_start = ggml_time_us(); + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = bark_build_fine_gpt_graph( + &model, allocr, input, nn, n_fine_codebooks, n_threads); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + ggml_backend_tensor_get(inpL, logits.data(), 0, sizeof(float)*n_vocab*block_size); + + model.t_predict_us += ggml_time_us() - t_predict_us_start; + + return true; +} + +static bool bark_eval_text_encoder(struct bark_context * bctx, int n_threads) { + bark_sequence input = bctx->tokens; + bark_sequence output; + + auto & params = bctx->params; + + int32_t n_steps_text_encoder = params.n_steps_text_encoder; + int32_t semantic_vocab_size = params.semantic_vocab_size; + int32_t 
semantic_pad_token = params.semantic_pad_token; + + BarkProgressBar progress(std::string("Generating semantic tokens"), n_steps_text_encoder); + + auto & model = bctx->model.text_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + const int n_vocab = hparams.n_out_vocab; + + float min_eos_p = bctx->params.min_eos_p; + float temp = bctx->params.temp; + + std::vector logits; + logits.resize(n_vocab); + + float eos_p = 0; + int n_past = 0; + + for (int i = 0; i < n_steps_text_encoder; i++) { + if (!bark_eval_encoder_internal(model, allocr, input, logits, &n_past, true, n_threads)) { + fprintf(stderr, "%s: Could not generate token\n", __func__); + return false; + } + + std::vector relevant_logits(logits.begin(), logits.begin() + semantic_vocab_size); + relevant_logits.push_back(logits[semantic_pad_token]); + + input.clear(); + + bark_token next = gpt_sample( + logits, bctx->rng, temp, &eos_p, &model.t_sample_us, &model.n_sample); + + if (next == semantic_vocab_size || eos_p >= min_eos_p) { + break; + } + + input.push_back(next); + output.push_back(next); + + progress.update(1); + progress.print(); + } + + bctx->semantic_tokens = output; + + return true; +} + +static bool bark_eval_coarse_encoder(struct bark_context * bctx, int n_threads) { + bark_codes out_coarse; + bark_sequence out; + + bark_sequence input = bctx->semantic_tokens; + + auto & model = bctx->model.coarse_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + auto & params = bctx->params; + + const int n_vocab = hparams.n_out_vocab; + + std::vector logits; + logits.resize(n_vocab); + + int max_coarse_history = params.max_coarse_history; + int sliding_window_size = params.sliding_window_size; + int n_coarse_codebooks = params.n_coarse_codebooks; + int semantic_vocab_size = params.semantic_vocab_size; + int codebook_size = params.codebook_size; + + float coarse_rate_hz = params.coarse_rate_hz; + float semantic_rate_hz = params.semantic_rate_hz; + + int32_t coarse_semantic_pad_token = params.coarse_semantic_pad_token; + int32_t coarse_infer_token = params.coarse_infer_token; + + float temp = params.temp; + + float stc_ratio = coarse_rate_hz / semantic_rate_hz * n_coarse_codebooks; + + int max_semantic_history = floorf(max_coarse_history / stc_ratio); + + int n_steps = floorf(input.size() * stc_ratio / n_coarse_codebooks) * n_coarse_codebooks; + assert(n_steps > 0); + assert(n_steps % n_coarse_codebooks == 0); + + BarkProgressBar progress(std::string("Generating coarse tokens"), n_steps); + + int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + + int step_idx = 0; + + for (int i = 0; i < n_window_steps; i++) { + int semantic_idx = roundf(n_steps / stc_ratio); + + bark_sequence input_in( + input.begin() + std::max(semantic_idx - max_semantic_history, 0), + input.end() + ); + + size_t original_size = input_in.size(); + input_in.resize(256); + + // padding from the right side + for (int ix = original_size; ix < 256; ix++) { + input_in[ix] = coarse_semantic_pad_token; + } + input_in.push_back(coarse_infer_token); + + // concatenate input_in and input_coarse + input_in.insert( + input_in.end(), + std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), + std::make_move_iterator(out.end()) + ); + + int n_past = 0; + + for (int j = 0; j < sliding_window_size; j++) { + if (step_idx >= n_steps) { + continue; + } + + if (!bark_eval_encoder_internal(model, allocr, input_in, logits, &n_past, false, n_threads)) { + fprintf(stderr, "%s: Could not 
generate token\n", __func__); + return false; + } + + input_in.clear(); + + bool is_major = step_idx % n_coarse_codebooks == 0; + int start_idx = semantic_vocab_size + (1 - is_major) * codebook_size; + int end_idx = semantic_vocab_size + (2 - is_major) * codebook_size; + + std::vector relevant_logits( + logits.begin() + start_idx, + logits.begin() + end_idx + ); + + bark_token next = gpt_sample( + relevant_logits, bctx->rng, temp, NULL, &model.t_sample_us, &model.n_sample); + + next += start_idx; + + input_in.push_back(next); + out.push_back(next); + + step_idx += 1; + + progress.update(1); + progress.print(); + } + } + + assert((int) out.size() == n_steps); + assert(out.size() % n_coarse_codebooks == 0); + + // out_coarse: [seq_length, n_codes] + for (int i = 0; i < (int) out.size(); i += n_coarse_codebooks) { + // this assumes N_COARSE_CODEBOOKS = 2 + bark_sequence _tmp = { + out[i] - semantic_vocab_size, + out[i+1] - semantic_vocab_size - codebook_size + }; + out_coarse.push_back(_tmp); + } + + bctx->coarse_tokens = out_coarse; + + return true; +} + +static bool bark_eval_fine_encoder(struct bark_context * bctx, int n_threads) { + // input shape: [N, n_codes] + bark_codes input = bctx->coarse_tokens; + + std::vector logits; + logits.resize(1024*1056); + + auto & model = bctx->model.fine_model; + auto & hparams = model.hparams; + auto & params = bctx->params; + + float temp = params.fine_temp; + + int32_t n_coarse_codebooks = params.n_coarse_codebooks; + int32_t n_fine_codebooks = params.n_fine_codebooks; + int32_t codebook_size = params.codebook_size; + + int n_coarse = input[0].size(); + int original_seq_len = input.size(); + int n_remove_from_end = 0; + + // channel padding + for (int i = 0; i < (int) input.size(); i++) { + for (int j = n_coarse_codebooks; j < n_fine_codebooks; j++) { + input[i].push_back(codebook_size); + } + } + + // spatial padding if sequence is too short + if (original_seq_len < 1024) { + n_remove_from_end = 1024 - original_seq_len; + for (int i = original_seq_len; i < 1024; i++) { + bark_sequence _tmp(n_fine_codebooks, codebook_size); + input.push_back(_tmp); + } + } + + int n_loops = std::max(0, (int) ceilf((input.size() - 1024) / 512.f)) + 1; + + bark_codes in_arr = input; // [seq_length, n_codes] + + BarkProgressBar progress(std::string("Generating fine tokens"), n_loops * (n_fine_codebooks - n_coarse)); + + for (int n = 0; n < n_loops; n++) { + int start_idx = std::min(n * 512, (int) in_arr.size() - 1024); + int start_fill_idx = std::min(n * 512, (int) in_arr.size() - 512); + int rel_start_fill_idx = start_fill_idx - start_idx; + + // in_buffer: [n_codes*seq_length] (sequences are contiguous) + bark_sequence in_buffer; + for (int i = 0; i < n_fine_codebooks; i++) { + for (int j = start_idx; j < start_idx + 1024; j++) { + in_buffer.push_back(in_arr[j][i]); + } + } + + for (int nn = n_coarse; nn < n_fine_codebooks; nn++) { + if (!bark_eval_fine_encoder_internal(bctx, in_buffer, logits, nn, n_threads)) { + fprintf(stderr, "%s: Could not generate token\n", __func__); + return false; + } + + for (int i = 0; i < 1024; i++) { + std::vector relevant_logits( + logits.begin() + i * 1056, + logits.begin() + (i + 1) * 1056 + ); + relevant_logits.resize(codebook_size); + + bark_token next = gpt_sample( + relevant_logits, bctx->rng, temp, NULL, &model.t_sample_us, + &model.n_sample); + + in_buffer[nn * 1024 + rel_start_fill_idx + i] = next; + } + + progress.update(1); + progress.print(); + } + + // transfer over info into model_in + for (int nn = n_coarse; nn < 
n_fine_codebooks; nn++) { + for (int j = 0; j < codebook_size - rel_start_fill_idx; j++) { + in_arr[start_fill_idx+j][nn] = in_buffer[nn * 1024 + rel_start_fill_idx + j]; + } + } + } + + if (n_remove_from_end > 0) { + in_arr.resize(in_arr.size() - n_remove_from_end); + } + + assert(bctx->coarse_tokens.size() == in_arr.size()); + + bctx->fine_tokens = in_arr; + + return true; +} + +bool bark_forward_text_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity) { + const int64_t t_main_start_us = ggml_time_us(); + + auto & model = bctx->model.text_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + // allocate the compute buffer + { + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + bctx->allocr = ggml_allocr_new_measure(align); + + // create the worst-case graph for memory usage estimation + int n_past = 0; + std::vector decoy_tokens(256+256+1, 0); + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, decoy_tokens, &n_past, true /* merge_ctx */, n_threads); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf); + + // recreate the allocator with the required memory + ggml_allocr_free(bctx->allocr); + bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0); + } + } + + if (!bark_eval_text_encoder(bctx, n_threads)) { + fprintf(stderr, "%s: failed to forward text encoder\n", __func__); + return false; + } + + model.t_main_us = ggml_time_us() - t_main_start_us; + + bark_print_statistics(&model); + + ggml_backend_buffer_free(bctx->buf_compute); + ggml_allocr_free(bctx->allocr); + + return true; +} + +bool bark_forward_coarse_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity) { + const int64_t t_main_start_us = ggml_time_us(); + + auto & model = bctx->model.coarse_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + // allocate the compute buffer + { + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + bctx->allocr = ggml_allocr_new_measure(align); + + // create the worst-case graph for memory usage estimation + int n_past = 0; + std::vector decoy_tokens(hparams.block_size, 0); + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, decoy_tokens, &n_past, false /* merge_ctx */, n_threads); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf); + + // recreate the allocator with the required memory + ggml_allocr_free(bctx->allocr); + bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0); + } + } + + if (!bark_eval_coarse_encoder(bctx, n_threads)) { + fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__); + return false; + } + + model.t_main_us = ggml_time_us() - t_main_start_us; + + bark_print_statistics(&model); + + ggml_backend_buffer_free(bctx->buf_compute); + ggml_allocr_free(bctx->allocr); + + return true; +} + +bool 
bark_forward_fine_encoder(
+        struct bark_context * bctx,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    const int64_t t_main_start_us = ggml_time_us();
+
+    auto & model = bctx->model.fine_model;
+    auto & allocr = bctx->allocr;
+    auto & hparams = model.hparams;
+    auto & params = bctx->params;
+
+    int32_t n_fine_codebooks = params.n_fine_codebooks;
+
+    // allocate the compute buffer
+    {
+        // alignment required by the backend
+        size_t align = ggml_backend_get_alignment(model.backend);
+        bctx->allocr = ggml_allocr_new_measure(align);
+
+        // create the worst-case graph for memory usage estimation
+        std::vector<bark_token> decoy_tokens(hparams.block_size*n_fine_codebooks, 0);
+        struct ggml_cgraph * gf = bark_build_fine_gpt_graph(
+            &model, allocr, decoy_tokens, 2 /* codebook_idx */, n_fine_codebooks, n_threads);
+
+        // compute the required memory
+        size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf);
+
+        // recreate the allocator with the required memory
+        ggml_allocr_free(bctx->allocr);
+        bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
+        bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute);
+
+        if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) {
+            fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0);
+        }
+    }
+
+    if (!bark_eval_fine_encoder(bctx, n_threads)) {
+        fprintf(stderr, "%s: failed to forward fine encoder\n", __func__);
+        return false;
+    }
+
+    model.t_main_us = ggml_time_us() - t_main_start_us;
+
+    bark_print_statistics(&model);
+
+    ggml_backend_buffer_free(bctx->buf_compute);
+    ggml_allocr_free(bctx->allocr);
+
+    return true;
+}
+
+static bool bark_forward_eval(
+        struct bark_context * bctx,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    if (!bark_forward_text_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward text encoder\n", __func__);
+        return false;
+    }
+
+    if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__);
+        return false;
+    }
+
+    if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward fine encoder\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+bool bark_generate_audio(
+        struct bark_context * bctx,
+        std::string & text,
+        std::string & dest_wav_path,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    if (!bctx) {
+        fprintf(stderr, "%s: invalid bark context\n", __func__);
+        return false;
+    }
+
+    int64_t t_start_eval_us = ggml_time_us();
+
+    bark_tokenize_input(bctx, text);
+
+    if (!bark_forward_eval(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward eval\n", __func__);
+        return false;
+    }
+
+    // call the Encodec API to generate the audio waveform from the tokens
+    const int n_gpu_layers = bctx->n_gpu_layers;
+    const std::string encodec_model_path = bctx->encodec_model_path;
+
+    struct encodec_context * ectx = encodec_load_model(
+        encodec_model_path, n_gpu_layers, encodec_verbosity_level::LOW);
+    if (!ectx) {
+        printf("%s: error loading encodec model\n", __func__);
+        return false;
+    }
+
+    auto & params = bctx->params;
+
+    int32_t target_bandwidth = params.target_bandwidth;
+    int32_t sample_rate = params.sample_rate;
+
+    encodec_set_target_bandwidth(ectx, target_bandwidth);
+    encodec_set_sample_rate(ectx, sample_rate);
+
+    // current shape fine_tokens: [seq_length][n_channels], n_channels are contiguous
+    // encodec expects shape fine_tokens: [n_channels][seq_length],
time steps are contiguous + std::vector encodec_tokens; + + // copy fine_tokens into encodec_tokens by transposing to abide by encodec's shape + for (int i = 0; i < (int) bctx->fine_tokens[0].size(); i++) { + for (int j = 0; j < (int) bctx->fine_tokens.size(); j++) { + encodec_tokens.push_back(bctx->fine_tokens[j][i]); + } + } + + if (!encodec_decompress_audio(ectx, encodec_tokens, n_threads)) { + printf("%s: Could not generate waveform from tokens with Encodec\n", __func__); + return false; + } + + bctx->audio_arr = ectx->out_audio; + + encodec_free(ectx); + + bctx->t_eval_us = ggml_time_us() - t_start_eval_us; + + return true; +} + +static void bark_free_model(struct gpt_model * model) { + if (!model) { + return; + } + + if(model->ctx) { + ggml_free(model->ctx); + } + + ggml_backend_buffer_free(model->buffer_w); + ggml_backend_free(model->backend); +} + +void bark_free(struct bark_context * bctx) { + if (!bctx) { + return; + } + + bark_free_model(&bctx->model.text_model); + bark_free_model(&bctx->model.coarse_model); + bark_free_model(&bctx->model.fine_model); + + delete bctx; +} + +static struct bark_model * bark_load_model_from_file( + const std::string & dirname, + struct bark_model * model, + int n_gpu_layers, + bark_verbosity_level verbosity) { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: loading model from '%s'\n", __func__, dirname.c_str()); + } + + // text + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: reading bark text model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_text.bin"; + if (!gpt_load_model_weights(fname, model->text_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // vocab + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: reading bark vocab\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_vocab.bin"; + const gpt_hparams hparams = model->text_model.hparams; + const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; + + if (!bark_vocab_load(fname, &model->vocab, expected_size)) { + fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // coarse + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("\n%s: reading bark coarse model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_coarse.bin"; + + if (!gpt_load_model_weights(fname, model->coarse_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // fine + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("\n%s: reading bark fine model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_fine.bin"; + + if (!gpt_load_model_weights(fname, model->fine_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); + return nullptr; + } + } + + printf("\n"); + + return model; +} + +struct bark_context_params bark_context_default_params() { + struct bark_context_params result = { + /*.seed =*/ 0, + /*.temp =*/ 0.7, + /*.fine_temp =*/ 0.5, + /*.min_eos_p 
=*/ 0.2, + /*.sliding_window_size =*/ 60, + /*.max_coarse_history =*/ 630, + /*.sample_rate =*/ 24000, + /*.target_bandwidth =*/ 12, + /*.cls_token_id =*/ 101, + /*.sep_token_id =*/ 102, + /*.n_steps_text_encoder =*/ 768, + /*.text_pad_token =*/ 129595, + /*.text_encoding_offset =*/ 10048, + /*.semantic_rate_hz =*/ 49.9f, + /*.semantic_pad_token =*/ 10000, + /*.semantic_vocab_size =*/ 10000, + /*.semantic_infer_token =*/ 129599, + /*.coarse_rate_hz =*/ 75.0f, + /*.coarse_infer_token =*/ 12050, + /*.coarse_semantic_pad_token =*/ 12048, + /*.n_coarse_codebooks =*/ 2, + /*.n_fine_codebooks =*/ 8, + /*.codebook_size =*/ 1024, + }; + + return result; +} + +struct bark_context * bark_load_model( + const std::string & model_path, + bark_verbosity_level verbosity) { + int64_t t_load_start_us = ggml_time_us(); + + struct bark_context * bctx = new bark_context(); + + bctx->model = bark_model(); + if (!bark_load_model_from_file(model_path, &bctx->model, bctx->n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str()); + return {}; + } + + bark_context_params params = bark_context_default_params(); + bctx->rng = std::mt19937(params.seed); + + bctx->params = params; + + bctx->t_load_us = ggml_time_us() - t_load_start_us; + + return bctx; +} + +bool bark_model_quantize( + const std::string & fname_inp, + const std::string & fname_out, + ggml_ftype ftype) { + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + gpt_model model; + + auto fin = std::ifstream(fname_inp, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gpt_hparams hparams; + + // load hparams + { + auto & hparams = model.hparams; + + read_safe(fin, hparams.n_layer); + read_safe(fin, hparams.n_head); + read_safe(fin, hparams.n_embd); + read_safe(fin, hparams.block_size); + read_safe(fin, hparams.bias); + read_safe(fin, hparams.n_in_vocab); + read_safe(fin, hparams.n_out_vocab); + read_safe(fin, hparams.n_lm_heads); + read_safe(fin, hparams.n_wtes); + read_safe(fin, hparams.ftype); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); + printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); + printf("%s: block_size = %d\n", __func__, hparams.block_size); + printf("%s: bias = %d\n", __func__, hparams.bias); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); + printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + 
write_safe(fout, hparams.n_layer); + write_safe(fout, hparams.n_head); + write_safe(fout, hparams.n_embd); + write_safe(fout, hparams.block_size); + write_safe(fout, hparams.bias); + write_safe(fout, hparams.n_in_vocab); + write_safe(fout, hparams.n_out_vocab); + write_safe(fout, hparams.n_lm_heads); + write_safe(fout, hparams.n_wtes); + write_safe(fout, ftype_dst); + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte/.*", + "model/lm_head/.*", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if (!ggml_quantize_weights(fin, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + fin.close(); + fout.close(); + + return true; +} diff --git a/bark/bark.h b/bark/bark.h new file mode 100644 index 0000000..495a2c9 --- /dev/null +++ b/bark/bark.h @@ -0,0 +1,347 @@ +#include "ggml.h" +#include "ggml-backend.h" + +#include +#include +#include +#include + +#ifdef BARK_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef BARK_BUILD +# define BARK_API __declspec(dllexport) +# else +# define BARK_API __declspec(dllimport) +# endif +# else +# define BARK_API __attribute__ ((visibility ("default"))) +# endif +#else +# define BARK_API +#endif + +enum class bark_verbosity_level { + LOW = 0, + MEDIUM = 1, + HIGH = 2, +}; + +typedef int32_t bark_token; + +typedef std::vector bark_sequence; +typedef std::vector> bark_codes; + +struct gpt_hparams { + int32_t n_in_vocab; + int32_t n_out_vocab; + int32_t n_layer; + int32_t n_head; + int32_t n_embd; + int32_t block_size; + int32_t n_lm_heads; + int32_t n_wtes; + int32_t ftype; + int32_t bias; + + int32_t n_codes_given = 1; +}; + +struct bark_vocab { + using id = int32_t; + using token = std::string; + + std::map token_to_id; + std::map id_to_token; +}; + +struct gpt_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt_model { + gpt_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wpe; // position embedding + std::vector wtes; // token embedding + std::vector lm_heads; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + struct ggml_context * ctx; + + ggml_backend_t backend = NULL; + + ggml_backend_buffer_t buffer_w; + ggml_backend_buffer_t buffer_kv; + + std::map tensors; + + // + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + int64_t t_main_us = 0; + + // + int64_t n_sample = 0; + + // + int64_t memsize = 0; +}; + +struct bark_model { + // encoder + gpt_model coarse_model; + gpt_model fine_model; + gpt_model text_model; + + // vocab + bark_vocab vocab; +}; + +struct bark_context_params { + // RNG seed + uint32_t seed; + + // Temperature for sampling (text and coarse encoders) + float temp; + // Temperature for sampling (fine encoder) + float fine_temp; + + // Minimum probability for EOS token (text encoder) + float min_eos_p; + // Sliding window 
size for coarse encoder
+    int32_t sliding_window_size;
+    // Max history for coarse encoder
+    int32_t max_coarse_history;
+
+    // Sample rate
+    int32_t sample_rate;
+    // Target bandwidth
+    int32_t target_bandwidth;
+
+    // CLS token ID
+    int32_t cls_token_id;
+    // SEP token ID
+    int32_t sep_token_id;
+
+    // Maximum number of semantic tokens to generate
+    int32_t n_steps_text_encoder;
+
+    // Text PAD token ID
+    int32_t text_pad_token;
+    // Text encoding offset
+    int32_t text_encoding_offset;
+
+    // Semantic frequency rate
+    float semantic_rate_hz;
+    // Semantic PAD token ID
+    int32_t semantic_pad_token;
+    // Vocabulary size in semantic encoder
+    int32_t semantic_vocab_size;
+    // Semantic inference token ID
+    int32_t semantic_infer_token;
+
+    // Coarse frequency rate
+    float coarse_rate_hz;
+    // Coarse infer token ID
+    int32_t coarse_infer_token;
+    // Coarse semantic pad token ID
+    int32_t coarse_semantic_pad_token;
+
+    // Number of codebooks in coarse encoder
+    int32_t n_coarse_codebooks;
+    // Number of codebooks in fine encoder
+    int32_t n_fine_codebooks;
+    // Size of each codebook
+    int32_t codebook_size;
+};
+
+struct bark_context {
+    bark_model model;
+
+    // buffer for model evaluation
+    ggml_backend_buffer_t buf_compute;
+
+    // custom allocator
+    struct ggml_allocr * allocr = NULL;
+    int n_gpu_layers = 0;
+
+    std::mt19937 rng;
+
+    bark_sequence tokens;
+    bark_sequence semantic_tokens;
+
+    bark_codes coarse_tokens;
+    bark_codes fine_tokens;
+
+    std::vector<float> audio_arr;
+
+    // hyperparameters
+    bark_context_params params;
+
+    // statistics
+    int64_t t_load_us = 0;
+    int64_t t_eval_us = 0;
+
+    // encodec parameters
+    std::string encodec_model_path;
+};
+
+/**
+ * @brief Returns the default parameters for a bark context.
+ *
+ * @return bark_context_params The default parameters for a bark context.
+ */
+BARK_API struct bark_context_params bark_context_default_params(void);
+
+/**
+ * Loads a Bark model from the specified directory.
+ *
+ * @param model_path The directory path of the bark model to load.
+ * @param verbosity The verbosity level when loading the model.
+ * @return A pointer to the loaded bark model context.
+ */
+BARK_API struct bark_context * bark_load_model(
+    const std::string & model_path,
+    bark_verbosity_level verbosity);
+
+/**
+ * Generates an audio file from the given text using the specified Bark context.
+ *
+ * @param bctx The Bark context to use for generating the audio.
+ * @param text The text to generate audio from.
+ * @param dest_wav_path The path to save the generated audio file.
+ * @param n_threads The number of threads to use for generating the audio.
+ * @param verbosity The verbosity level when generating the audio.
+ * @return True if the audio was successfully generated, false otherwise.
+ */
+BARK_API bool bark_generate_audio(
+    bark_context * bctx,
+    std::string & text,
+    std::string & dest_wav_path,
+    int n_threads,
+    bark_verbosity_level verbosity);
+
+/**
+ * Quantizes a bark model and saves the result to a file.
+ *
+ * @param fname_inp The name of the input file containing the BARK model.
+ * @param fname_out The name of the output file to save the quantized model to.
+ * @param ftype The target ggml quantization type.
+ * @return True if the model was successfully quantized and saved, false otherwise.
+ */
+BARK_API bool bark_model_quantize(
+    const std::string & fname_inp,
+    const std::string & fname_out,
+    ggml_ftype ftype);
+
+/**
+ * @brief Frees the memory allocated for a bark context.
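+ * Releases the text, coarse and fine GPT models owned by the context before deleting it.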
+ * + * @param bctx The bark context to free. + */ +BARK_API void bark_free( + struct bark_context * bctx); + +/** + * Loads a vocabulary from a file. + * + * @param fname The name of the file to load the vocabulary from. + * @param vocab A pointer to the bark_vocab struct to store the loaded vocabulary in. + * @param expected_size The expected size of the vocabulary. + * @return true if the vocabulary was loaded successfully, false otherwise. + */ +bool bark_vocab_load( + const std::string & fname, + bark_vocab * vocab, + int32_t expected_size); + +/** + * Tokenizes the input text using the provided vocabulary. + * + * @param vocab Pointer to the vocabulary to use for tokenization. + * @param text The input text to tokenize. + * @param tokens Pointer to an array where the resulting tokens will be stored. + * @param n_tokens Pointer to an integer where the number of resulting tokens will be stored. + * @param n_max_tokens The maximum number of tokens that can be stored in the tokens array. + */ +void bert_tokenize( + const bark_vocab * vocab, + const char * text, + int32_t * tokens, + int32_t * n_tokens, + int32_t n_max_tokens); + +/** + * Encodes the input text using the forward algorithm. + * + * @param bctx A pointer to the bark context struct. + * @param n_threads The number of threads to use for encoding. + * @param verbosity The verbosity level when encoding. + * @return Returns true if the encoding was successful, false otherwise. + */ +bool bark_forward_text_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); + +/** + * \brief Encodes the input data using the coarse encoder in the bark library. + * + * This function encodes the input data using the coarse encoder in the bark library. + * It takes a bark_context structure pointer, the number of threads to use, and the verbosity level as parameters. + * + * \param bctx The bark_context structure pointer. + * \param n_threads The number of threads to use for encoding. + * \param verbosity The verbosity level for logging. + * \return Returns true if the encoding is successful, false otherwise. + */ +bool bark_forward_coarse_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); + +/** + * @brief Performs forward fine encoding using the specified bark context. + * + * This function encodes the input data using the bark context provided. It performs + * the encoding operation in parallel using the specified number of threads. The + * verbosity level can be used to control the amount of logging information printed + * during the encoding process. + * + * @param bctx The bark context used for encoding. + * @param n_threads The number of threads to use for parallel encoding. + * @param verbosity The verbosity level for logging information. + * @return True if the encoding operation was successful, false otherwise. + */ +bool bark_forward_fine_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); diff --git a/convert.py b/bark/convert.py similarity index 68% rename from convert.py rename to bark/convert.py index 6162aae..fd3f613 100644 --- a/convert.py +++ b/bark/convert.py @@ -11,21 +11,11 @@ - Name (char[name_length]) - Data (float[n_dims]) -Note ----- -Encodec uses weight normalization for its convolutional layers. All the weights are -decomposed into two tensors called with the suffixes _weight_v and _weight_g. 
A simple -call to the hook torch._weight_norm allows to get the final weight tensor of the -convolution from weight_v and weight_g. To drastically reduce the number of operations -at inference time, the ggml weights file only contain the final convolution weights but -does not store the decomposition into weight_v and weight_g. - Example ------- ```bash python convert.py \ --dir-model ~/.cache/suno/bark_v0 \ - --codec-path ~/Documents/encodec.cpp/ggml_weights \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ \ --use-f16 @@ -42,77 +32,21 @@ parser = argparse.ArgumentParser() parser.add_argument("--dir-model", type=str, required=True) -parser.add_argument("--codec-path", type=str, required=True) parser.add_argument("--vocab-path", type=str, required=True) parser.add_argument("--out-dir", type=str, required=True) parser.add_argument("--use-f16", action="store_true") -def parse_codec_model(checkpoint, out_dir): - """Load encodec model checkpoint.""" - outfile = open(out_dir, "wb") - outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic - - for name in checkpoint.keys(): - if "encoder." in name: - # bark only uses Encodec's quantizer and decoder. - continue - - if "weight_g" in name: - # the tensor has already been parsed with the corresponding "weight_v" - # tensor to form the final weights tensor of the convolution, therefore - # we skip it - continue - - if "inited" in name or "cluster_size" in name or "embed_avg" in name: - # "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used - # for the forward pass - continue - - var_data = checkpoint[name] - - if not "weight_v" in name: - # if conv kernel, do not squeeze because 3d tensor - var_data = var_data.numpy().squeeze() - else: - # weight_v has its corresponding magnitude tensor to rescale the weights - # of the convolutional layers. We parse both kinds of weights jointly to - # build the final weight tensor of the convolution. 
- base_name = name.split(".")[:-1] - weight_g_name = ".".join(base_name + ["weight_g"]) - var_data_g = checkpoint[weight_g_name] - - final_var_data = torch._weight_norm(var_data, var_data_g, dim=0) - var_data = final_var_data.numpy() - - name = ".".join(base_name + ["weight"]) - - print(f"Processing variable: {name} with shape: {var_data.shape}") - - if var_data.dtype != np.float32: - print(" Converting to float32") - var_data = var_data.astype(np.float32) - - n_dims = len(var_data.shape) - encoded_name = name.encode("utf-8") - ftype = 0 # float32 - outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype)) - - for i in range(n_dims): - outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i])) - outfile.write(encoded_name) - - var_data.tofile(outfile) - - outfile.close() - -def parse_hparams(hparams, outfile, use_f16): +def parse_hparams(hparams, outfile, use_f16, overwrite_bias): """Parse GPT hyperparameters.""" outfile.write(struct.pack("i", hparams["n_layer"])) outfile.write(struct.pack("i", hparams["n_head"])) outfile.write(struct.pack("i", hparams["n_embd"])) outfile.write(struct.pack("i", hparams["block_size"])) + bias = 1 if overwrite_bias else hparams["bias"] + outfile.write(struct.pack("i", int(bias))) + try: outfile.write(struct.pack("ii", hparams["vocab_size"], hparams["vocab_size"])) except KeyError: @@ -127,7 +61,7 @@ def parse_hparams(hparams, outfile, use_f16): n_wtes = hparams["n_codes_total"] except KeyError: n_lm_heads, n_wtes = 1, 1 - + ftype = int(use_f16) outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype)) @@ -140,12 +74,6 @@ def parse_text_models(checkpoint, outfile, use_f16): n_dims = len(var_data.shape) - # ftype_cur = 0 - # if var_data.dtype != np.float32: - # print(" Converting to float32") - # var_data = var_data.astype(np.float32) - # ftype_cur = 0 - # strip `_orig_mod.transformer.` prefix if name == "_orig_mod.lm_head.weight": name = "lm_head.weight" @@ -233,12 +161,12 @@ def parse_text_models(checkpoint, outfile, use_f16): var_data.tofile(outfile) -def generate_file(in_file, out_dir, use_f16): +def generate_file(in_file, out_dir, use_f16, overwrite_bias=False): with open(out_dir, "wb") as fout: fout.write(struct.pack("i", 0x67676d6c)) # ggml magic checkpoint = torch.load(in_file, map_location="cpu") - parse_hparams(checkpoint["model_args"], fout, use_f16) + parse_hparams(checkpoint["model_args"], fout, use_f16, overwrite_bias) parse_text_models(checkpoint["model"], fout, use_f16) def generate_vocab_file(dir_model, out_dir): @@ -262,7 +190,6 @@ def generate_vocab_file(dir_model, out_dir): args = parser.parse_args() dir_model = Path(args.dir_model) - codec_path = Path(args.codec_path) vocab_path = Path(args.vocab_path) out_dir = Path(args.out_dir) @@ -277,11 +204,9 @@ def generate_vocab_file(dir_model, out_dir): generate_file(dir_model / "coarse_2.pt", out_dir / "ggml_weights_coarse.bin", args.use_f16) print(" Coarse model loaded.") - generate_file(dir_model / "fine_2.pt", out_dir / "ggml_weights_fine.bin", args.use_f16) + # overwrite_bias set to True since the fine model has biases and current config file + # has bias set to False + generate_file(dir_model / "fine_2.pt", out_dir / "ggml_weights_fine.bin", args.use_f16, overwrite_bias=True) print(" Fine model loaded.") - codec_chkpt = torch.load(codec_path / "encodec_24khz-d7cc33bc.th", map_location="cpu") - parse_codec_model(codec_chkpt, out_dir / "ggml_weights_codec.bin") - print(" Codec model loaded.") - print("Done.") diff --git a/download_weights.py b/bark/download_weights.py 
similarity index 100% rename from download_weights.py rename to bark/download_weights.py diff --git a/bark/examples/CMakeLists.txt b/bark/examples/CMakeLists.txt new file mode 100644 index 0000000..4a09e81 --- /dev/null +++ b/bark/examples/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(common STATIC common.cpp) +target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_compile_features(common PRIVATE cxx_std_11) + +add_subdirectory(main) +add_subdirectory(server) +add_subdirectory(quantize) \ No newline at end of file diff --git a/bark/examples/common.cpp b/bark/examples/common.cpp new file mode 100644 index 0000000..b8cad4f --- /dev/null +++ b/bark/examples/common.cpp @@ -0,0 +1,69 @@ +#include +#include +#include + +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" + +#include "common.h" + +#define SAMPLE_RATE 24000 + +void write_wav_on_disk(std::vector & audio_arr, std::string dest_path) { + drwav_data_format format; + format.bitsPerSample = 32; + format.sampleRate = SAMPLE_RATE; + format.container = drwav_container_riff; + format.channels = 1; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + + drwav wav; + drwav_init_file_write(&wav, dest_path.c_str(), &format, NULL); + drwav_uint64 frames = drwav_write_pcm_frames(&wav, audio_arr.size(), audio_arr.data()); + drwav_uninit(&wav); + + fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames); +} + +void bark_print_usage(char ** argv, const bark_params & params) { + std::cout << "usage: " << argv[0] << " [options]\n" + << "\n" + << "options:\n" + << " -h, --help show this help message and exit\n" + << " -t N, --threads N number of threads to use during computation (default: " << params.n_threads << ")\n" + << " -s N, --seed N seed for random number generator (default: " << params.seed << ")\n" + << " -p PROMPT, --prompt PROMPT\n" + << " prompt to start generation with (default: random)\n" + << " -m FNAME, --model FNAME\n" + << " model path (default: " << params.model_path << ")\n" + << " -o FNAME, --outwav FNAME\n" + << " output generated wav (default: " << params.dest_wav_path << ")\n" + << "\n"; +} + +int bark_params_parse(int argc, char ** argv, bark_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = argv[++i]; + } else if (arg == "-m" || arg == "--model") { + params.model_path = argv[++i]; + } else if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(argv[++i]); + } else if (arg == "-o" || arg == "--outwav") { + params.dest_wav_path = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + bark_print_usage(argv, params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + bark_print_usage(argv, params); + exit(0); + } + } + + return 0; +} diff --git a/bark/examples/common.h b/bark/examples/common.h new file mode 100644 index 0000000..e347e55 --- /dev/null +++ b/bark/examples/common.h @@ -0,0 +1,48 @@ +#include +#include +#include + +struct bark_params { + // Number of threads used for audio generation. + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + + // User prompt. + std::string prompt = "This is an audio generated by bark.cpp"; + + // Location of model weights. + std::string model_path = "./ggml_weights"; + + // Destination path for generated WAV file. 
+    std::string dest_wav_path = "output.wav";
+
+    // Seed for reproducibility in token sampling.
+    int32_t seed = 0;
+};
+
+/**
+ * @brief Writes the generated audio data to a WAV file on disk.
+ *
+ * @param audio_arr Vector of float samples containing the audio data to write.
+ * @param dest_path Path of the WAV file to write.
+ */
+void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path);
+
+/**
+ * @brief Parses command line arguments and stores them in a bark_params struct.
+ *
+ * @param argc The number of command line arguments.
+ * @param argv An array of C-strings containing the command line arguments.
+ * @param params A reference to a bark_params struct where the parsed arguments will be stored.
+ * @return int Returns 0 if the parsing was successful, otherwise returns a non-zero value.
+ */
+int bark_params_parse(int argc, char ** argv, bark_params & params);
+
+/**
+ * Prints the usage information for the bark command-line tool.
+ *
+ * @param argv The command-line arguments passed to the program.
+ * @param params The parameters used by the bark command-line tool.
+ */
+void bark_print_usage(char ** argv, const bark_params & params);
diff --git a/dr_wav.h b/bark/examples/dr_wav.h
similarity index 100%
rename from dr_wav.h
rename to bark/examples/dr_wav.h
diff --git a/examples/main/CMakeLists.txt b/bark/examples/main/CMakeLists.txt
similarity index 64%
rename from examples/main/CMakeLists.txt
rename to bark/examples/main/CMakeLists.txt
index fd8855f..3d35bf7 100644
--- a/examples/main/CMakeLists.txt
+++ b/bark/examples/main/CMakeLists.txt
@@ -1,9 +1,6 @@
 set(TARGET main)
-
 add_executable(${TARGET} main.cpp)
-
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE bark common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 if(MSVC)
diff --git a/bark/examples/main/main.cpp b/bark/examples/main/main.cpp
new file mode 100644
index 0000000..6357bf0
--- /dev/null
+++ b/bark/examples/main/main.cpp
@@ -0,0 +1,60 @@
+#include
+#include
+
+#include "ggml.h"
+#include "bark.h"
+#include "common.h"
+
+
+int main(int argc, char **argv) {
+    ggml_time_init();
+    const int64_t t_main_start_us = ggml_time_us();
+
+    bark_params params;
+    bark_verbosity_level verbosity = bark_verbosity_level::LOW;
+
+    if (bark_params_parse(argc, argv, params) > 0) {
+        fprintf(stderr, "%s: Could not parse arguments\n", __func__);
+        return 1;
+    }
+
+    std::cout << R"( __ __ )" << "\n"
+              << R"( / /_ ____ ______/ /__ _________ ____ )" << "\n"
+              << R"( / __ \/ __ `/ ___/ //_/ / ___/ __ \/ __ \)" << "\n"
+              << R"( / /_/ / /_/ / / / ,< _ / /__/ /_/ / /_/ /)" << "\n"
+              << R"(/_.___/\__,_/_/ /_/|_| (_) \___/ .___/ .___/ )" << "\n"
+              << R"( /_/ /_/ )" << "\n";
+
+    // initialize bark context
+    struct bark_context * bctx = bark_load_model(params.model_path, verbosity);
+    if (!bctx) {
+        fprintf(stderr, "%s: Could not load model\n", __func__);
+        exit(1);
+    }
+
+    // TODO: for now, hardcoding the Encodec model path
+    bctx->encodec_model_path = "/Users/pbannier/Documents/encodec.cpp/ggml_weights/ggml-model.bin";
+
+    // generate audio
+    if (!bark_generate_audio(bctx, params.prompt, params.dest_wav_path, params.n_threads, verbosity)) {
+        fprintf(stderr, "%s: An error occurred.
If the problem persists, feel free to open an issue to report it.\n", __func__); + exit(1); + } + + auto & audio_arr = bctx->audio_arr; + write_wav_on_disk(audio_arr, params.dest_wav_path); + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, bctx->t_load_us/1000.0f); + printf("%s: eval time = %8.2f ms\n", __func__, bctx->t_eval_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + bark_free(bctx); + + return 0; +} diff --git a/bark/examples/quantize/CMakeLists.txt b/bark/examples/quantize/CMakeLists.txt new file mode 100644 index 0000000..c406ab1 --- /dev/null +++ b/bark/examples/quantize/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE bark) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/quantize.cpp b/bark/examples/quantize/main.cpp similarity index 95% rename from examples/quantize/quantize.cpp rename to bark/examples/quantize/main.cpp index bbfb403..3139f90 100644 --- a/examples/quantize/quantize.cpp +++ b/bark/examples/quantize/main.cpp @@ -1,4 +1,4 @@ -/*This script quantizes the weights of the 3 GPT encoders. 5 quantization types are +/* This script quantizes the weights of the 3 GPT encoders. 5 quantization types are available: - q4_0 - q4_1 @@ -7,14 +7,15 @@ - q8_0 Usage: +```bash ./quantize \ ./ggml_weights/ggml_weights_text.bin \ ./ggml_weights_q4/ggml_weights_text_quant.bin \ - type + q4_0 +``` */ #include "ggml.h" #include "bark.h" -#include "bark-util.h" #include #include diff --git a/examples/server/CMakeLists.txt b/bark/examples/server/CMakeLists.txt similarity index 59% rename from examples/server/CMakeLists.txt rename to bark/examples/server/CMakeLists.txt index 71dba1b..b6a6f8b 100644 --- a/examples/server/CMakeLists.txt +++ b/bark/examples/server/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET server) add_executable(${TARGET} server.cpp httplib.h json.hpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE bark ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/httplib.h b/bark/examples/server/httplib.h similarity index 100% rename from examples/server/httplib.h rename to bark/examples/server/httplib.h diff --git a/examples/server/json.hpp b/bark/examples/server/json.hpp similarity index 100% rename from examples/server/json.hpp rename to bark/examples/server/json.hpp diff --git a/examples/server/server.cpp b/bark/examples/server/server.cpp similarity index 85% rename from examples/server/server.cpp rename to bark/examples/server/server.cpp index b724784..d9a1c45 100644 --- a/examples/server/server.cpp +++ b/bark/examples/server/server.cpp @@ -96,23 +96,13 @@ int main(int argc, char ** argv) { bark_params_parse(argc, argv, params); - // create model - bark_model * model = bark_load_model_from_file(params.model_path.c_str()); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model_path.c_str()); + struct bark_context * bctx = bark_load_model(params.model_path.c_str(), bark_verbosity_level::LOW); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); return 1; } - // create params - bark_context_params bctx_params = bark_context_default_params(); - bark_context * bctx = bark_new_context_with_model(model, bctx_params); - if (bctx == NULL) { 
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model_path.c_str()); - bark_free_model(model); - return 1; - } - - bark_seed_rng(bctx, params.seed); + // bark_seed_rng(bctx, params.seed); std::mutex bark_mutex; @@ -135,13 +125,13 @@ int main(int argc, char ** argv) { std::string text = jreq.at("text"); // generate audio - bark_generate_audio(bctx, text.c_str(), "/tmp/bark_tmp.wav", params.n_threads); + std::string dest_wav_path = "/tmp/bark_tmp.wav"; + bark_generate_audio(bctx, text, dest_wav_path, params.n_threads, bark_verbosity_level::LOW); // read audio as binary std::ifstream wav_file("/tmp/bark_tmp.wav", std::ios::binary); - if (wav_file.is_open()) - { + if (wav_file.is_open()) { // Read the contents of the WAV file std::string wav_contents((std::istreambuf_iterator(wav_file)), std::istreambuf_iterator()); @@ -152,8 +142,7 @@ int main(int argc, char ** argv) { // Set the response body to the WAV file contents res.set_content(wav_contents, "audio/wav"); } - else - { + else { // If the file cannot be opened, set a 500 Internal Server Error response res.status = 500; res.set_content("Internal Server Error", "text/plain"); @@ -169,8 +158,7 @@ int main(int argc, char ** argv) { svr.set_read_timeout(params.sparams.read_timeout); svr.set_write_timeout(params.sparams.write_timeout); - if (!svr.bind_to_port(params.sparams.hostname, params.sparams.port)) - { + if (!svr.bind_to_port(params.sparams.hostname, params.sparams.port)) { fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.sparams.hostname.c_str(), params.sparams.port); return 1; @@ -183,8 +171,7 @@ int main(int argc, char ** argv) { printf("\nbark server listening at http://%s:%d\n\n", params.sparams.hostname.c_str(), params.sparams.port); - if (!svr.listen_after_bind()) - { + if (!svr.listen_after_bind()) { return 1; } diff --git a/requirements.txt b/bark/requirements.txt similarity index 100% rename from requirements.txt rename to bark/requirements.txt diff --git a/bark/tests/CMakeLists.txt b/bark/tests/CMakeLists.txt new file mode 100644 index 0000000..43ca0f3 --- /dev/null +++ b/bark/tests/CMakeLists.txt @@ -0,0 +1,26 @@ +add_library(test_utils STATIC common.cpp) +target_include_directories(test_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_compile_features(test_utils PRIVATE cxx_std_11) + +# +# test-tokenizer + +set(TEST_TARGET test-tokenizer) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) + + +# +# test-coarse-encoder + +set(TEST_TARGET test-coarse-encoder) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) + + +# +# test-fine-encoder + +set(TEST_TARGET test-fine-encoder) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) diff --git a/tests/common.cpp b/bark/tests/common.cpp similarity index 98% rename from tests/common.cpp rename to bark/tests/common.cpp index 164a471..7fe11ea 100644 --- a/tests/common.cpp +++ b/bark/tests/common.cpp @@ -1,12 +1,10 @@ +#include #include -#include #include +#include -#include "bark-util.h" #include "common.h" -#define BARK_API_INTERNAL - int64_t bytes_left(std::ifstream & f) { // utils to check all bytes are read from stream int64_t curr_pos = f.tellg(); @@ -16,6 +14,11 @@ int64_t bytes_left(std::ifstream & f) { return bytes_left_to_read; } +template +static void read_safe(std::ifstream& fin, T& dest) { + fin.read((char*)& dest, sizeof(T)); +} + template 
inline bool all_close( std::vector s1, diff --git a/tests/common.h b/bark/tests/common.h similarity index 97% rename from tests/common.h rename to bark/tests/common.h index db89b67..6090e56 100644 --- a/tests/common.h +++ b/bark/tests/common.h @@ -1,5 +1,4 @@ #pragma once -#include "bark.h" #include #include @@ -9,6 +8,8 @@ typedef std::vector logit_sequence; typedef std::vector> logit_matrix; +typedef std::vector> bark_codes; + /* Comparison utils */ template inline bool all_equal(std::vector s1, std::vector s2, int * n_violations); diff --git a/bark/tests/test-coarse-encoder.cpp b/bark/tests/test-coarse-encoder.cpp new file mode 100644 index 0000000..9918c5a --- /dev/null +++ b/bark/tests/test-coarse-encoder.cpp @@ -0,0 +1,72 @@ +/* Usage: + +```bash + ./bin/test-coarse-encoder ../ggml_weights/ +``` +*/ +#include +#include +#include + +#include "bark.h" + +const int n_threads = 4; +const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM; + +const bark_sequence semantic_tokens = { + 1913, 8020, 8572, 8572, 1722, 59, 28, 28, 28, 8606, 7695, 7695, 6948, 9488, 92, 28, 107, 9296, 4093, 1640, 1449, 50, 1079, 441, 10, 41, 8275, 847, 8396, 8396, 6747, 7656, 2049, 7656, 5156, 5156, 8865, 178, 50, 178, 1015, 441, 10, 41, 3451, 5737, 2563, 3354, 4382, 734, 4683, 827, 396, 50, 10, 27, 27, 8093, 7401, 937, 937, 937, 259, 2066, 4485, 1385, 1385, 4, 4, 1385, 7588, 660, 252, 252, 252, 663, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 // this is a dog barking. +}; +const bark_sequence semantic_tokens_2 = { + 10, 5785, 10, 6043, 6043, 6043, 6043, 6043, 6043, 4019, 8137, 4166, 5832, 7803, 8010, 8010, 8010, 6174, 6174, 741, 741, 6592, 741, 441, 10, 783, 206, 206, 206, 10, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 344 , 10, 65, 344, 147, 55, 10, 57, 57, 7882, 6863, 6863, 4298, 9111, 9111 , 5862, 5862, 5862, 3741, 657, 120, 171, 2895, 741, 6750, 6750, 10, 2330, 2795 , 2795, 5131, 5131, 2415, 2415, 2130, 880, 8634, 59, 28, 28, 28, 28, 28 , 28, 28, 28, 28, 1133, 5586, 8607, 6799, 4156, 4156, 1177, 326, 326, 741 , 232, 100, 6401, 3670, 5899, 8266, 8266, 4825, 8522, 4323, 7298, 7298, 26, 26 , 26, 1262, 3705, 985, 6844, 441, 441, 5555, 10, 2690, 8428, 10, 985, 7010 , 147, 147, 2907, 2907, 59, 28, 28, 28, 28, 107, 1310, 8968, 8968, 9366 , 9366, 1732, 1732, 1732, 373, 9263, 4480, 4480, 4480, 4480, 4236, 1285, 1285, 1285 , 1285, 1285, 1285, 1285, 1285, 1285, 2997, 2997, 6662, 3761, 3761, 1003, 9293, 83 , 83, 20, 2881, 4978, 5457, 602, 147, 5457, 5457, 5457, 10, 7309, 147, 147 , 147, 217, 8934, 9046, 9510, 9510, 9510, 956, 956, 2320, 2320, 7283, 3088, 3212 , 1152, 3212, 122, 59, 28, 28, 28, 107, 107, 28, 107, 107, 223, 223 , 223, 223, 2784, 206, 230, 206, 1710, 602, 10, 5092, 9862, 10, 55, 206 , 193, 147, 193, 206, 206, 4374, 206, 206, 517, 206, 206, 10, 1278, 1278 , 2089, 147, 147, 10, 57, 604, 7882, 6863, 6863, 4298, 9111, 9111, 5862, 5862 , 9516, 3741, 3599, 120, 120, 1443, 8627, 7274, 1025, 10, 10, 6356, 1878, 8485 , 6703, 8922, 5951, 3506, 2237, 9218, 9218, 4977, 1697, 1697, 3599, 232, 1606, 1620 , 10, 99, 401, 6236, 3573, 9090, 9090, 9090, 298, 128, 5794, 8099, 7610, 389 , 9944, 823, 9456, 9456, 4238, 4238, 3645, 288, 120, 298, 5546, 2921, 2921, 6076 , 3937, 4909, 3937, 6501, 6501, 441, 10, 245, 3623, 3493, 2846, 9056, 9056, 3361 , 1112, 2180, 741, 211, 211, 10, 402, 10, 8934, 2673, 2389, 2389, 4382, 734 , 734, 4683, 9935, 5771, 7901, 232, 232, 10, 27, 27, 3971, 4089, 8844, 6750 , 441, 441, 10, 100, 6200, 3158, 8396, 8396, 8396, 2069, 557, 557, 7901, 
741 , 256, 2430, 59, 28, 28, 28, 107, 7883, 6027, 3182, 3182, 3755, 208, 208 , 2462, 232, 10, 6401, 4747, 9818, 7557, 7557, 7557, 208, 208, 5327, 2462, 441 , 10, 10, 41, 4942, 8022, 8022, 8726, 6664, 8726, 8522, 3767, 3767, 3767, 4775 , 6133, 281, 3374, 8376, 8376, 3374, 8376, 441, 8376, 763, 5092, 10, 56, 230 , 56, 230, 147, 206, 206, 206, 206, 5199, 206, 206, 206, 206, 206, 206 , 206, 206, 206, 206, 206, 147, 3252, 206, 91, 2966, 55, 1278, 147, 147 , 147, 55, 3961, 147, 147, 147, 302, 6356, 6513, 6513, 6513, 6513, 6513, 6513 , 6513, 421, 421, 4925, 4925, 4925, 4925, 4925, 7813, 7813, 7813, 1430, 8634, 8811 , 59, 59, 28, 28, 107, 6467, 9569, 5920, 9124, 9124, 5481, 5481, 2507, 2507 , 9921, 422, 215, 215, 215, 6123, 6123, 5916, 5916, 5916, 8184, 4698, 7900, 7900 , 7900, 7900, 664, 749, 278, 749, 749, 10, 5457, 602, 5457, 5457, 147, 55 , 7309, 10, 2330, 3540, 8772, 1430, 1430, 985, 441, 10, 1532, 2384, 8536, 5187 , 8869, 6105, 6105, 6105, 5313, 1471, 1471, 9935, 3561, 1242, 232, 100, 10, 27 , 4168, 4168, 4286, 8634, 8634, 207, 28, 28, 28, 254, 9569, 9569, 5920, 9124 , 5481, 5481, 2507, 2507, 441, 10, 329, 195, 1136, 1136, 3619, 5131, 5131, 1662 , 2415, 741, 10, 5026, 6043, 6043, 9662, 9662, 9002, 7857, 7857, 4786, 4786, 4323 , 4323, 26, 26, 26, 26, 2451, 2451, 10, 10, 266, 206, 206, 206, 206 , 206, 206, 206, 206, 206, 206, 206, 65, 206, 206, 206, 65, 344, 55 , 344, 147, 147, 10, 57, 302, 2201, 2201, 2201, 2201, 5411, 5411, 4554, 7714 , 7714, 2580, 1025, 1025, 1025, 7710, 1973, 1973, 535, 321 +}; + +std::vector > transpose(const std::vector > data) { + // this assumes that all inner vectors have the same size and + // allocates space for the complete result in advance + std::vector > result(data[0].size(), + std::vector(data.size())); + for (std::vector::size_type i = 0; i < data[0].size(); i++) + for (std::vector::size_type j = 0; j < data.size(); j++) { + result[i][j] = data[j][i]; + } + return result; +} + + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string weights_dir = argv[1]; + + // initialize bark context + struct bark_context * bctx = bark_load_model(weights_dir.c_str(), verbosity); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); + exit(1); + } + + bctx->semantic_tokens = semantic_tokens_2; + + // generate coarse tokens + if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) { + fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__); + return 1; + } + + // print coarse tokens + fprintf(stderr, "shape of coarse tokens: [%zu, %zu]\n", bctx->coarse_tokens.size(), bctx->coarse_tokens[0].size()); + + bark_codes ct = transpose(bctx->coarse_tokens); + + for (size_t i = 0; i < ct.size(); i++) { + for (size_t j = 0; j < ct[i].size(); j++) { + fprintf(stderr, "%d ", ct[i][j]); + } + fprintf(stderr, "\n"); + } + + return 0; +} \ No newline at end of file diff --git a/bark/tests/test-fine-encoder.cpp b/bark/tests/test-fine-encoder.cpp new file mode 100644 index 0000000..78a2abf --- /dev/null +++ b/bark/tests/test-fine-encoder.cpp @@ -0,0 +1,70 @@ +/* Usage: + +```bash + ./bin/test-fine-encoder ../ggml_weights/ +``` +*/ +#include +#include +#include + +#include "bark.h" + +const int n_threads = 4; +const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM; + +const bark_codes coarse_tokens = { + { 395, 395, 395, 395, 475, 395, 475, 395, 395, 395, 395, 395, 819, 395, 395, 395, 395, 395, 395, 819, 819, 395, 395, 395, 395, 395, 395, 395, 395, 395, 537, 887, 
537, 499, 835, 475, 404, 475, 395, 475, 855, 257, 475, 404, 779, 779, 395, 395, 23, 59, 881, 59, 901, 151, 860, 819, 819, 819, 373, 819, 819, 635, 1011, 373, 798, 819, 373, 819, 709, 819, 819, 819, 635, 323, 192, 901, 59, 942, 871, 208, 430, 604, 834, 430, 475, 475, 395, 475, 537, 233, 747, 428, 683, 112, 402, 216, 683, 112, 402, 216, 216, 99, 683, 112, 402, 216, 216, 683, 112, 428, 428, 690, 942, 871, 208, 228, 904, 404, 404, 499, 404, 475, 395, 475, 257, 835, 475, 475, 475, 395, 475, 257, 475, 475, 855, 887, 392, 216, 683, 112, 112, 402, 11, 11, 11, 323, 91, 904, 404, 855, 404, 779, 677, 475, 59, 59, 151, 276, 23, 276, 276, 347, 347, 879, 753, 325, 879, 1011, 753, 276, 276, 753, 276, 228, 855, 835, 475, 475, 475, 475, 106, 475, 395, 537, 835, 257, 404, 835, 475, 887, 475, 475, 475, 855, 475, 475, 475, 475, 475, 475, 475, 475, 475, 475 }, + { 969, 928, 928, 913, 928, 43, 424, 913, 518, 200, 200, 544, 544, 200, 200, 200, 424, 200, 424, 544, 969, 200, 964, 200, 913, 969, 544, 200, 200, 544, 646, 200, 913, 648, 969, 518, 544, 424, 913, 518, 424, 544, 913, 424, 544, 913, 913, 544, 73, 504, 591, 952, 591, 655, 1007, 429, 603, 857, 4, 857, 896, 1010, 504, 35, 955, 67, 4, 1010, 857, 857, 857, 857, 961, 964, 381, 955, 952, 955, 386, 403, 601, 961, 765, 544, 913, 424, 765, 424, 928, 453, 403, 505, 833, 478, 478, 478, 478, 478, 478, 478, 478, 95, 478, 478, 478, 478, 478, 478, 95, 663, 136, 386, 386, 891, 770, 896, 516, 937, 544, 747, 928, 969, 913, 424, 363, 424, 424, 424, 424, 646, 913, 544, 928, 424, 544, 463, 478, 185, 776, 300, 685, 685, 371, 663, 513, 105, 1007, 770, 1007, 969, 544, 964, 648, 519, 717, 591, 833, 364, 364, 105, 364, 770, 200, 364, 519, 519, 519, 519, 519, 745, 942, 519, 829, 928, 859, 937, 913, 424, 544, 424, 424, 518, 200, 648, 928, 544, 544, 424, 424, 646, 913, 424, 913, 544, 913, 913, 913, 518, 928, 913, 913, 913, 913, 518}, +}; + +std::vector > transpose(const std::vector > data) { + // this assumes that all inner vectors have the same size and + // allocates space for the complete result in advance + std::vector > result(data[0].size(), + std::vector(data.size())); + for (std::vector::size_type i = 0; i < data[0].size(); i++) + for (std::vector::size_type j = 0; j < data.size(); j++) { + result[i][j] = data[j][i]; + } + return result; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string weights_dir = argv[1]; + + // initialize bark context + struct bark_context * bctx = bark_load_model(weights_dir.c_str(), verbosity); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); + exit(1); + } + + bctx->coarse_tokens = transpose(coarse_tokens); + + // generate fine tokens + if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) { + fprintf(stderr, "%s: failed to forward fine encoder\n", __func__); + return 1; + } + + // print fine tokens + fprintf(stderr, "shape of fine tokens: [%zu, %zu]\n", bctx->fine_tokens.size(), bctx->fine_tokens[0].size()); + + bark_codes ft = transpose(bctx->fine_tokens); + // bark_codes ft = bctx->fine_tokens; + + for (size_t i = 0; i < ft.size(); i++) { + for (size_t j = 0; j < ft[i].size(); j++) { + fprintf(stderr, "%d ", ft[i][j]); + } + fprintf(stderr, "\n"); + } + + return 0; +} \ No newline at end of file diff --git a/tests/test-tokenizer.cpp b/bark/tests/test-tokenizer.cpp similarity index 89% rename from tests/test-tokenizer.cpp rename to bark/tests/test-tokenizer.cpp index 7fb8973..0ab00f5 100644 --- a/tests/test-tokenizer.cpp +++ 
b/bark/tests/test-tokenizer.cpp @@ -1,19 +1,16 @@ +/* Usage: + +```bash + ./bin/test-tokenizer ../ggml_weights/ggml_vocab.bin +``` +*/ #include #include #include #include -#define BARK_API_INTERNAL #include "bark.h" -struct bark_vocab { - using id = int32_t; - using token = std::string; - - std::map token_to_id; - std::map id_to_token; -}; - static const std::map & k_tests() { static std::map _k_tests = { @@ -39,9 +36,9 @@ int main(int argc, char **argv) { bark_vocab vocab; int max_ctx_size = 256; - if (bark_vocab_load(fname.c_str(), &vocab, 119547) > 0) { + if (!bark_vocab_load(fname.c_str(), &vocab, 119547)) { fprintf(stderr, "%s: invalid vocab file '%s'\n", __func__, fname.c_str()); - return 1; + exit(1); } for (const auto & test_kv : k_tests()) { diff --git a/encodec.cpp b/encodec.cpp deleted file mode 100644 index eee3730..0000000 --- a/encodec.cpp +++ /dev/null @@ -1,535 +0,0 @@ -#include "encodec.h" -#include "ggml.h" -#include "bark-util.h" - -#include -#include -#include -#include -#include -#include - -static void encodec_sigmoid_impl(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, void * userdata) { - GGML_ASSERT(userdata == NULL); - GGML_ASSERT(ggml_are_same_shape(dst, src)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_is_contiguous(src)); - - const float * src_data = ggml_get_data_f32(src); - float * dst_data = ggml_get_data_f32(dst); - - const int ne = (int)ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = std::min(ie0 + dr, ne); - - for (int i = ie0; i < ie1; ++i) { - dst_data[i] = 1.0f / (1.0f + expf(-src_data[i])); - } -} - -static struct ggml_tensor * encodec_sigmoid(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_custom1(ctx, x, encodec_sigmoid_impl, GGML_N_TASKS_MAX, NULL); -} - -static int get_extra_padding_for_conv_1d(ggml_tensor * inp, float kernel_size, float stride, float padding_total) { - float length = inp->ne[0]; - float n_frames = (length - kernel_size + padding_total) / stride + 1.0f; - int ideal_length = (ceilf(n_frames) - 1) * stride + (kernel_size - padding_total); - return ideal_length - length; -} - -static struct ggml_tensor * pad_1d(ggml_context * ctx0, ggml_tensor * inp, int padding_left, int padding_right) { - int length = inp->ne[0]; - int dim = inp->ne[1]; - - const int max_pad = std::max(padding_left, padding_right); - int extra_pad = 0; - - if (length <= max_pad) { - extra_pad = max_pad - length + 1; - - // constant padding - struct ggml_tensor * out = ggml_new_tensor_2d(ctx0, inp->type, length+extra_pad, dim); - ggml_set_zero(out); - out = ggml_set_2d(ctx0, out, inp, out->nb[1], 0); - } - - struct ggml_tensor * padded = ggml_pad_reflec_1d(ctx0, inp, padding_left, padding_right); - - const int end = padded->ne[0] - extra_pad; - struct ggml_tensor *dest = ggml_view_2d(ctx0, padded, end, dim, padded->nb[1], 0); - - return dest; -} - -static struct ggml_tensor * unpad_1d(ggml_context * ctx0, ggml_tensor * inp, int padding_left, int padding_right) { - int length = inp->ne[0]; - int dim = inp->ne[1]; - - ENCODEC_ASSERT(padding_left >= 0); - ENCODEC_ASSERT(padding_right >= 0); - ENCODEC_ASSERT(padding_left + padding_right <= length); - - int end = length - padding_right; - - int offset = padding_left * inp->nb[1]; - struct ggml_tensor * dst = ggml_view_2d(ctx0, inp, end, dim, inp->nb[1], offset); - - return dst; -} - -static struct ggml_tensor * forward_pass_lstm_unilayer( - struct ggml_context * ctx0, - struct ggml_tensor * inp, - struct 
ggml_tensor * weight_ih, - struct ggml_tensor * weight_hh, - struct ggml_tensor * bias_ih, - struct ggml_tensor * bias_hh) { - - const int input_dim = inp->ne[1]; - const int hidden_dim = weight_ih->ne[1]/4; - const int seq_length = inp->ne[0]; - - struct ggml_tensor * hs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length); - - struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); - struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); - - h_t = ggml_set_zero(h_t); - c_t = ggml_set_zero(c_t); - - struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - - for (int t = 0; t < seq_length; t++) { - struct ggml_tensor * x_t = ggml_view_1d(ctx0, current, input_dim, t*current->nb[1]); - - struct ggml_tensor * inp_gates = ggml_mul_mat(ctx0, weight_ih, x_t); - inp_gates = ggml_add(ctx0, inp_gates, bias_ih); - - struct ggml_tensor * hid_gates = ggml_mul_mat(ctx0, weight_hh, h_t); - hid_gates = ggml_add(ctx0, hid_gates, bias_hh); - - struct ggml_tensor * out_gates = ggml_add(ctx0, inp_gates, hid_gates); - - struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim)); - struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim)); - struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim)); - struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim)); - - c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t)); - h_t = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_t)); - - hs = ggml_set_1d(ctx0, hs, h_t, t*hs->nb[1]); - } - - hs = ggml_cont(ctx0, ggml_transpose(ctx0, hs)); - - return hs; -} - -static struct ggml_tensor * strided_conv_1d( - ggml_context * ctx0, - ggml_tensor * inp, - ggml_tensor * conv_w, - ggml_tensor * conv_b, - int stride) { - int kernel_size = conv_w->ne[0]; - int padding_total = kernel_size - stride; - int extra_padding = get_extra_padding_for_conv_1d(inp, kernel_size, stride, padding_total); - - struct ggml_tensor * padded_inp = pad_1d(ctx0, inp, padding_total, extra_padding); - struct ggml_tensor * dst = ggml_conv_1d(ctx0, conv_w, padded_inp, stride, 0, 1); - - // add bias - dst = ggml_transpose(ctx0, dst); - dst = ggml_add(ctx0, ggml_repeat(ctx0, conv_b, dst), dst); - dst = ggml_cont(ctx0, ggml_transpose(ctx0, dst)); - - return dst; -} - -static struct ggml_tensor * strided_conv_transpose_1d( - ggml_context * ctx0, - ggml_tensor * inp, - ggml_tensor * conv_w, - ggml_tensor * conv_b, - int stride) { - int kernel_size = conv_w->ne[0]; - int padding_total = kernel_size - stride; - - struct ggml_tensor * dst = ggml_conv_transpose_1d(ctx0, conv_w, inp, stride, 0, 1); - - // add bias - dst = ggml_transpose(ctx0, dst); - dst = ggml_add(ctx0, ggml_repeat(ctx0, conv_b, dst), dst); - dst = ggml_cont(ctx0, ggml_transpose(ctx0, dst)); - - int padding_right = ceilf(padding_total); - int padding_left = padding_total - padding_right; - - struct ggml_tensor * unpadded = unpad_1d(ctx0, dst, padding_left, padding_right); - unpadded = ggml_cont(ctx0, unpadded); - - return unpadded; -} - -int encodec_model_load(const std::string& fname, encodec_model& model) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 1; - } - - // verify magic (i.e. 
ggml signature in hex format) - { - uint32_t magic; - read_safe(fin, magic); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return 1; - } - } - - auto & ctx = model.ctx; - size_t ctx_size = 0; - - // Evaluating context size - { - const auto & hparams = model.hparams; - - const int in_channels = hparams.in_channels; - const int hidden_dim = hparams.hidden_dim; - const int n_filters = hparams.n_filters; - const int kernel_size = hparams.kernel_size; - const int res_kernel_sz = hparams.residual_kernel_size; - const int n_q = hparams.n_q; - const int n_bins = hparams.n_bins; - const int *ratios = hparams.ratios; - - // decoder - { - // initial conv1d layer - ctx_size += in_channels*n_filters*kernel_size*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += n_filters*ggml_type_size(GGML_TYPE_F32); //bias - - int mult = 1; // scaling factor for hidden size - - for (int i = 0; i < 4; i++) { - // conv1 - ctx_size += res_kernel_sz*(mult*n_filters)*(mult*n_filters/2)*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += (mult*n_filters/2)*ggml_type_size(GGML_TYPE_F32); // bias - - // conv2 - ctx_size += (mult*n_filters/2)*(mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - - // shortcut conv - ctx_size += (mult*n_filters)*(mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - - // downsampling blocks - ctx_size += (2*ratios[i])*(mult*n_filters)*(mult*n_filters*2)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters*2)*ggml_type_size(GGML_TYPE_F32); - - mult *= 2; - } - - // lstm - { - // l0_ih, l0_hh, l1_ih, l1_hh all have the same shapes, hence 4 - ctx_size += 4*(mult*n_filters)*(4*mult*n_filters)*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += 4*(4*mult*n_filters)*ggml_type_size(GGML_TYPE_F32); // bias - } - - // final conv - ctx_size += kernel_size*(mult*n_filters)*hidden_dim*ggml_type_size(GGML_TYPE_F32); - ctx_size += hidden_dim*ggml_type_size(GGML_TYPE_F32); - } - - // quantizer - { - ctx_size += n_q*hidden_dim*n_bins; // embed - } - - ctx_size += 10ull*MB; // object overhead - } - - // create the ggml context - { - struct ggml_init_params params = { - /* .mem_size = */ ctx_size, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; - - model.ctx = ggml_init(params); - if(!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int in_channels = hparams.in_channels; - const int hidden_dim = hparams.hidden_dim; - const int n_filters = hparams.n_filters; - const int kernel_size = hparams.kernel_size; - const int res_kernel_sz = hparams.residual_kernel_size; - const int n_q = hparams.n_q; - const int *ratios = hparams.ratios; - const int n_bins = hparams.n_bins; - - // decoder - { - model.decoder.blocks.resize(4); - - int mult = 16; // 2**len(ratios) - - model.decoder.init_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, kernel_size, hidden_dim, mult*n_filters); - model.decoder.init_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters); - - model.tensors["decoder.model.0.conv.conv.weight"] = model.decoder.init_conv_w; - model.tensors["decoder.model.0.conv.conv.bias"] = model.decoder.init_conv_b; - - // LSTM - model.decoder.lstm.l0_ih_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - model.decoder.lstm.l1_ih_w = 
ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.weight_ih_l0"] = model.decoder.lstm.l0_ih_w; - model.tensors["decoder.model.1.lstm.weight_ih_l1"] = model.decoder.lstm.l1_ih_w; - - model.decoder.lstm.l0_hh_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - model.decoder.lstm.l1_hh_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.weight_hh_l0"] = model.decoder.lstm.l0_hh_w; - model.tensors["decoder.model.1.lstm.weight_hh_l1"] = model.decoder.lstm.l1_hh_w; - - model.decoder.lstm.l0_ih_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - model.decoder.lstm.l1_ih_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.bias_ih_l0"] = model.decoder.lstm.l0_ih_b; - model.tensors["decoder.model.1.lstm.bias_ih_l1"] = model.decoder.lstm.l1_ih_b; - - model.decoder.lstm.l0_hh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - model.decoder.lstm.l1_hh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.bias_hh_l0"] = model.decoder.lstm.l0_hh_b; - model.tensors["decoder.model.1.lstm.bias_hh_l1"] = model.decoder.lstm.l1_hh_b; - - for (int i = 0; i < 4; i++) { - // upsampling - model.decoder.blocks[i].us_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, ratios[i]*2, mult*n_filters/2, mult*n_filters); - model.decoder.blocks[i].us_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)) + ".convtr.convtr.weight"] = model.decoder.blocks[i].us_conv_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)) + ".convtr.convtr.bias"] = model.decoder.blocks[i].us_conv_b; - - // conv1 - model.decoder.blocks[i].conv_1_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, res_kernel_sz, mult*n_filters/2, mult*n_filters/4); - model.decoder.blocks[i].conv_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/4); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.1.conv.conv.weight"] = model.decoder.blocks[i].conv_1_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.1.conv.conv.bias"] = model.decoder.blocks[i].conv_1_b; - - // conv2 - model.decoder.blocks[i].conv_2_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, mult*n_filters/4, mult*n_filters/2); - model.decoder.blocks[i].conv_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.3.conv.conv.weight"] = model.decoder.blocks[i].conv_2_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.3.conv.conv.bias"] = model.decoder.blocks[i].conv_2_b; - - // shortcut - model.decoder.blocks[i].conv_sc_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, mult*n_filters/2, mult*n_filters/2); - model.decoder.blocks[i].conv_sc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".shortcut.conv.conv.weight"] = model.decoder.blocks[i].conv_sc_w; - model.tensors["decoder.model." 
+ std::to_string(3*(i+1)+1) + ".shortcut.conv.conv.bias"] = model.decoder.blocks[i].conv_sc_b; - - mult /= 2; - } - - model.decoder.final_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, kernel_size, n_filters, in_channels); - model.decoder.final_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); - - model.tensors["decoder.model.15.conv.conv.weight"] = model.decoder.final_conv_w; - model.tensors["decoder.model.15.conv.conv.bias"] = model.decoder.final_conv_b; - } - - // quantizer - { - model.quantizer.blocks.resize(n_q); - for (int i = 0; i < n_q; i++) { - model.quantizer.blocks[i].embed = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins); - model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"] = model.quantizer.blocks[i].embed; - } - } - - } - - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while(true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ftype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[3] = {1, 1, 1}; - for (int i = 0; i < n_dims; i++) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name; - std::vector buf(length); - fin.read(&buf[0], buf.size()); - name.assign(&buf[0], buf.size()); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return 1; - } - - auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return 1; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%d, %d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]); - return 1; - } - - const size_t bpe = ggml_type_size(ggml_type(ftype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - return 1; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - // printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - - total_size += ggml_nbytes(tensor); - model.n_loaded++; - } - - fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return 0; -} - -struct ggml_tensor * encodec_quantizer_decode_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * codes) { - // codes: [seq_length, n_codes] - const int hidden_dim = model.hparams.hidden_dim; - const int seq_length = codes->ne[0]; - const int n_q = codes->ne[1]; - - struct ggml_tensor * quantized_out = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length); - quantized_out = ggml_set_zero(quantized_out); - - for (int i = 0; i < n_q; i++) { - encodec_quant_block block = model.quantizer.blocks[i]; - - struct ggml_tensor * indices = ggml_view_1d(ctx0, codes, seq_length, i*codes->nb[1]); - struct ggml_tensor * quantized = ggml_get_rows(ctx0, block.embed, indices); - - quantized_out = ggml_add(ctx0, quantized_out, quantized); - } - - quantized_out = ggml_cont(ctx0, ggml_transpose(ctx0, quantized_out)); - - return quantized_out; -} - -struct ggml_tensor * encodec_decoder_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * quantized_out) { - const auto & hparams = model.hparams; - const int * ratios = hparams.ratios; - const int stride = hparams.stride; - - struct ggml_tensor * inpL = strided_conv_1d( - ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride); - - // lstm - { - struct ggml_tensor * cur = inpL; - - const encodec_lstm lstm = model.decoder.lstm; - - // first lstm layer - struct ggml_tensor * hs1 = forward_pass_lstm_unilayer( - ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b); - - // second lstm layer - struct ggml_tensor * out = forward_pass_lstm_unilayer( - ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b); - - inpL = ggml_add(ctx0, inpL, out); - } - - for (int layer_ix = 0; layer_ix < 4; layer_ix++) { - encodec_decoder_block block = model.decoder.blocks[layer_ix]; - - // upsampling layers - inpL = ggml_elu(ctx0, inpL); - - inpL = strided_conv_transpose_1d( - ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]); - - struct ggml_tensor * current = inpL; - - // shortcut - struct ggml_tensor * shortcut = strided_conv_1d( - ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride); - - // conv1 - current = ggml_elu(ctx0, current); - - current = strided_conv_1d( - ctx0, current, block.conv_1_w, block.conv_1_b, stride); - - // conv2 - current = ggml_elu(ctx0, current); - - current = strided_conv_1d( - ctx0, current, block.conv_2_w, block.conv_2_b, stride); - - // residual connection - inpL = ggml_add(ctx0, current, shortcut); - } - - // final conv - inpL = ggml_elu(ctx0, inpL); - - struct ggml_tensor * output = strided_conv_1d( - ctx0, inpL, model.decoder.final_conv_w, model.decoder.final_conv_b, stride); - - return output; -} diff --git a/encodec.cpp b/encodec.cpp new file mode 160000 index 0000000..e50cd96 --- /dev/null +++ b/encodec.cpp @@ -0,0 +1 @@ +Subproject commit e50cd96d28c89f6c1343c291042b14bab6f3b83b diff --git a/encodec.h b/encodec.h deleted file mode 100644 index d19af6d..0000000 --- a/encodec.h +++ /dev/null @@ -1,142 +0,0 @@ -/* This is a shortened version of the original Encodec.CPP here: https://github.com/PABannier/encodec.cpp. -Only the decoding quantizer and decoder part is implemented in this file. 
-*/ -#pragma once - -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include - -#define ENCODEC_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "ENCODEC_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -struct encodec_hparams { - int32_t in_channels = 1; - int32_t hidden_dim = 128; - int32_t n_filters = 32; - int32_t ratios[4] = {8, 5, 4, 2}; - int32_t kernel_size = 7; - int32_t residual_kernel_size = 3; - int32_t compress = 2; - int32_t n_lstm_layers = 2; - int32_t stride = 1; - - // 24kbps (n_q=32) - int32_t n_q = 32; - int32_t n_bins = 1024; - int32_t sr = 24000; -}; - -// res + downsample block at some ratio -struct encodec_encoder_block { - // conv1 - struct ggml_tensor * conv_1_w; - struct ggml_tensor * conv_1_b; - - // conv2 - struct ggml_tensor * conv_2_w; - struct ggml_tensor * conv_2_b; - - // shortcut - struct ggml_tensor * conv_sc_w; - struct ggml_tensor * conv_sc_b; - - // downsampling layers - struct ggml_tensor * ds_conv_w; - struct ggml_tensor * ds_conv_b; -}; - -struct encodec_lstm { - struct ggml_tensor * l0_ih_w; - struct ggml_tensor * l0_hh_w; - - struct ggml_tensor * l0_ih_b; - struct ggml_tensor * l0_hh_b; - - struct ggml_tensor * l1_ih_w; - struct ggml_tensor * l1_hh_w; - - struct ggml_tensor * l1_ih_b; - struct ggml_tensor * l1_hh_b; -}; - -struct encodec_quant_block { - struct ggml_tensor * embed; -}; - -struct encodec_quantizer { - std::vector blocks; -}; - -struct encodec_decoder_block { - //upsampling layers - struct ggml_tensor * us_conv_w; - struct ggml_tensor * us_conv_b; - - // conv1 - struct ggml_tensor * conv_1_w; - struct ggml_tensor * conv_1_b; - - // conv2 - struct ggml_tensor * conv_2_w; - struct ggml_tensor * conv_2_b; - - // shortcut - struct ggml_tensor * conv_sc_w; - struct ggml_tensor * conv_sc_b; -}; - -struct encodec_decoder { - struct ggml_tensor * init_conv_w; - struct ggml_tensor * init_conv_b; - - encodec_lstm lstm; - - struct ggml_tensor * final_conv_w; - struct ggml_tensor * final_conv_b; - - std::vector blocks; -}; - -struct encodec_model { - encodec_hparams hparams; - - encodec_quantizer quantizer; - encodec_decoder decoder; - - // context - struct ggml_context * ctx; - int n_loaded; - - std::map tensors; - - int64_t t_predict_us = 0; - int64_t t_main_us = 0; - - int64_t memsize = 0; - size_t mem_per_token = 0; -}; - - -int encodec_model_load(const std::string& fname, encodec_model& model); - -struct ggml_tensor * encodec_quantizer_decode_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * codes); - -struct ggml_tensor * encodec_decoder_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * quantized_out); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index d167621..0000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -add_subdirectory(main) -add_subdirectory(server) -add_subdirectory(quantize) diff --git a/examples/main/main.cpp b/examples/main/main.cpp deleted file mode 100644 index 7a1dab9..0000000 --- a/examples/main/main.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "ggml.h" -#include "bark.h" - -#include - -struct bark_params { - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - - // user prompt - std::string prompt = "this is an audio"; - - // paths - std::string model_path = "./ggml_weights"; - std::string dest_wav_path = "output.wav"; - - 
int32_t seed = 0; -}; - -void bark_print_usage(char ** argv, const bark_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -s N, --seed N seed for random number generator (default: %d)\n", params.seed); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model_path.c_str()); - fprintf(stderr, " -o FNAME, --outwav FNAME\n"); - fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); - fprintf(stderr, "\n"); -} - -int bark_params_parse(int argc, char ** argv, bark_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model_path = argv[++i]; - } else if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); - } else if (arg == "-o" || arg == "--outwav") { - params.dest_wav_path = argv[++i]; - } else if (arg == "-h" || arg == "--help") { - bark_print_usage(argv, params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - bark_print_usage(argv, params); - exit(0); - } - } - - return 0; -} - -std::tuple bark_init_from_params(bark_params & params) { - bark_model * model = bark_load_model_from_file(params.model_path.c_str()); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model_path.c_str()); - return std::make_tuple(nullptr, nullptr); - } - - bark_context_params bctx_params = bark_context_default_params(); - bark_context * bctx = bark_new_context_with_model(model, bctx_params); - if (bctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model_path.c_str()); - bark_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - return std::make_tuple(model, bctx); -} - -int main(int argc, char **argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - bark_params params; - - if (bark_params_parse(argc, argv, params) > 0) { - fprintf(stderr, "%s: Could not parse arguments\n", __func__); - return 1; - } - - int64_t t_load_us = 0; - int64_t t_eval_us = 0; - - bark_context * bctx; - bark_model * model; - - // load the model - const int64_t t_start_us = ggml_time_us(); - std::tie(model, bctx) = bark_init_from_params(params); - t_load_us = ggml_time_us() - t_start_us; - - printf("\n"); - - bark_seed_rng(bctx, params.seed); - - const int64_t t_eval_us_start = ggml_time_us(); - bark_generate_audio(bctx, params.prompt.data(), params.dest_wav_path.c_str(), params.n_threads); - t_eval_us = ggml_time_us() - t_eval_us_start; - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - bark_free(bctx); - - return 0; -} 
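The deleted `examples/main/main.cpp` above drove generation through the old two-step API (`bark_load_model_from_file` followed by `bark_new_context_with_model`). The replacement `bark/examples/main/main.cpp` and the updated `server.cpp` in this diff go through a single `bark_context` instead. Below is a minimal sketch of that new call flow, assuming only the signatures visible elsewhere in this diff (`bark_load_model`, `bark_generate_audio`, `write_wav_on_disk`); the weights path, the prompt, and the boolean return check on `bark_generate_audio` are placeholders / assumptions, not a definitive implementation.

```cpp
// Minimal sketch of the new single-context API, as used by the updated
// examples in this diff. Signatures are taken from the new main.cpp and
// server.cpp shown above; the weights path and prompt are placeholders.
#include <cstdio>
#include <string>

#include "bark.h"

int main() {
    const std::string model_path = "./ggml_weights";   // placeholder path
    const std::string dest_wav   = "./output.wav";
    const int n_threads = 4;

    // One call replaces bark_load_model_from_file + bark_new_context_with_model.
    struct bark_context * bctx = bark_load_model(model_path.c_str(), bark_verbosity_level::LOW);
    if (!bctx) {
        fprintf(stderr, "failed to load model from '%s'\n", model_path.c_str());
        return 1;
    }

    // Text in, audio out; the context owns all intermediate token buffers.
    // Treating the return value as a success flag is an assumption here.
    if (!bark_generate_audio(bctx, "this is an audio", dest_wav, n_threads, bark_verbosity_level::LOW)) {
        fprintf(stderr, "generation failed\n");
        bark_free(bctx);
        return 1;
    }

    // As in the new main.cpp, the generated samples stay available on the
    // context and can be written out explicitly.
    write_wav_on_disk(bctx->audio_arr, dest_wav);

    bark_free(bctx);
    return 0;
}
```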
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt deleted file mode 100644 index d0cb815..0000000 --- a/examples/quantize/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -set(TARGET quantize) - -add_executable(${TARGET} quantize.cpp) - -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -if(MSVC) - target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1) -endif() \ No newline at end of file diff --git a/ggml b/ggml deleted file mode 160000 index a16b01d..0000000 --- a/ggml +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a16b01d6891fd885800988003d53755c9574c6e4 diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake deleted file mode 100644 index 5023b77..0000000 --- a/scripts/build-info.cmake +++ /dev/null @@ -1,53 +0,0 @@ -set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in") -set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") -set(BUILD_NUMBER 0) -set(BUILD_COMMIT "unknown") - -# Look for git -find_package(Git) -if(NOT Git_FOUND) - execute_process( - COMMAND which git - OUTPUT_VARIABLE GIT_EXECUTABLE - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(NOT GIT_EXECUTABLE STREQUAL "") - set(Git_FOUND TRUE) - message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") - else() - message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.") - endif() -endif() - -# Get the commit count and hash -if(Git_FOUND) - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE HEAD - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE GIT_HEAD_RESULT - ) - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE COUNT - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE GIT_COUNT_RESULT - ) - if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) - set(BUILD_COMMIT ${HEAD}) - set(BUILD_NUMBER ${COUNT}) - endif() -endif() - -# Only write the header if it's changed to prevent unnecessary recompilation -if(EXISTS ${HEADER_FILE}) - file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") - list(GET CONTENTS 0 EXISTING) - if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) - endif() -else() - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) -endif() diff --git a/scripts/build-info.h.in b/scripts/build-info.h.in deleted file mode 100644 index 75d1e16..0000000 --- a/scripts/build-info.h.in +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER @BUILD_NUMBER@ -#define BUILD_COMMIT "@BUILD_COMMIT@" - -#endif // BUILD_INFO_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt deleted file mode 100644 index e386832..0000000 --- a/tests/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -function(bark_add_test source) - get_filename_component(TEST_TARGET ${source} NAME_WE) - add_executable(${TEST_TARGET} ${source}) - install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) - target_compile_features(${TEST_TARGET} PRIVATE cxx_std_11) - add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) -endfunction() - -bark_add_test(test-tokenizer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_vocab.bin) -# 
bark_add_test(test-text-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-coarse-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_coarse.bin) -# bark_add_test(test-fine-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_fine.bin) -# bark_add_test(test-forward-semantic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-forward-coarse.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-forward-fine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_fine.bin) -# bark_add_test(test-forward-encodec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_encodec.bin) diff --git a/tests/data/coarse/test_pass_coarse_1.bin b/tests/data/coarse/test_pass_coarse_1.bin deleted file mode 100644 index 7612c6c..0000000 Binary files a/tests/data/coarse/test_pass_coarse_1.bin and /dev/null differ diff --git a/tests/data/coarse/test_pass_coarse_2.bin b/tests/data/coarse/test_pass_coarse_2.bin deleted file mode 100644 index 268792c..0000000 Binary files a/tests/data/coarse/test_pass_coarse_2.bin and /dev/null differ diff --git a/tests/data/coarse/test_pass_coarse_3.bin b/tests/data/coarse/test_pass_coarse_3.bin deleted file mode 100644 index 582382a..0000000 Binary files a/tests/data/coarse/test_pass_coarse_3.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_1.bin b/tests/data/fine/test_pass_fine_1.bin deleted file mode 100644 index 7dd2a1a..0000000 Binary files a/tests/data/fine/test_pass_fine_1.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_2.bin b/tests/data/fine/test_pass_fine_2.bin deleted file mode 100644 index 9e7c61e..0000000 Binary files a/tests/data/fine/test_pass_fine_2.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_3.bin b/tests/data/fine/test_pass_fine_3.bin deleted file mode 100644 index 34fff18..0000000 Binary files a/tests/data/fine/test_pass_fine_3.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin deleted file mode 100644 index a75b57e..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin deleted file mode 100644 index 7bc5904..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin deleted file mode 100644 index b4472b0..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin deleted file mode 100644 index 82026e2..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin deleted file mode 100644 index 38fb523..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin deleted file mode 100644 index 81f38c5..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin and /dev/null differ diff 
--git a/tests/data/gpt_eval/test_gpt_eval_1_merge.bin b/tests/data/gpt_eval/test_gpt_eval_1_merge.bin deleted file mode 100644 index 1a86c5a..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_1_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin deleted file mode 100644 index 6349af1..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_2_merge.bin b/tests/data/gpt_eval/test_gpt_eval_2_merge.bin deleted file mode 100644 index 4e954f3..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_2_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin deleted file mode 100644 index 7355c98..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_3_merge.bin b/tests/data/gpt_eval/test_gpt_eval_3_merge.bin deleted file mode 100644 index fbed27b..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_3_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin deleted file mode 100644 index af57d05..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_1.bin b/tests/data/semantic/test_pass_semantic_1.bin deleted file mode 100644 index c13b44a..0000000 Binary files a/tests/data/semantic/test_pass_semantic_1.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_2.bin b/tests/data/semantic/test_pass_semantic_2.bin deleted file mode 100644 index 7738dbf..0000000 Binary files a/tests/data/semantic/test_pass_semantic_2.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_3.bin b/tests/data/semantic/test_pass_semantic_3.bin deleted file mode 100644 index b5bfd9b..0000000 Binary files a/tests/data/semantic/test_pass_semantic_3.bin and /dev/null differ diff --git a/tests/test-fine-gpt-eval.cpp b/tests/test-fine-gpt-eval.cpp deleted file mode 100644 index a7630dc..0000000 --- a/tests/test-fine-gpt-eval.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include - -#include "bark.h" -#include "common.h" - - -static const std::vector> test_args = { - { "./data/fine_gpt_eval/test_fine_gpt_eval_1.bin", 2 }, // prompt: Hello, my name is Suno. And, uh - and I like pizza. [laughs] But I also have other interests such as playing tic tac toe. - { "./data/fine_gpt_eval/test_fine_gpt_eval_2.bin", 3 }, // prompt: Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible. - { "./data/fine_gpt_eval/test_fine_gpt_eval_3.bin", 4 }, // prompt: ♪ In the jungle, the mighty jungle, the lion barks tonight ♪ - { "./data/fine_gpt_eval/test_fine_gpt_eval_4.bin", 5 }, // prompt: I have a silky smooth voice, and today I will tell you about the exercise regimen of the common sloth. - { "./data/fine_gpt_eval/test_fine_gpt_eval_5.bin", 6 }, // prompt: You cannot, my good sir, take that away from me without having me retaliate in the most ferocious way. - { "./data/fine_gpt_eval/test_fine_gpt_eval_6.bin", 7 }, // prompt: C’est un roc ! c’est un pic ! c’est un cap ! Que dis-je, c’est un cap ? C’est une péninsule ! 
-}; - -static const int n_threads = 4; - -template -std::vector flatten(std::vector> const &vec) { - std::vector flattened; - for (auto const &v: vec) { - flattened.insert(flattened.end(), v.begin(), v.end()); - } - return flattened; -} - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_fine.bin"; - - gpt_model model; - if (gpt_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_codes tokens; - std::vector gt_logits, logits; - - // dry run to estimate mem_per_token - bark_sequence decoy = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - fine_gpt_eval(model, decoy.data(), decoy.size(), nullptr, n_threads, 2); - - for (int i = 0; i < (int) test_args.size(); i++) { - std::string path = std::get<0>(test_args[i]); - int codebook_ix = std::get<1>(test_args[i]); - - tokens.clear(); - gt_logits.clear(); - logits.clear(); - - load_test_data(path, tokens, gt_logits); - tokens = transpose(tokens); - - std::vector tokens_vec = flatten(tokens); - - logits.resize(1024*1056); - fine_gpt_eval(model, tokens_vec.data(), tokens_vec.size(), logits.data(), n_threads, codebook_ix); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_logits, logits)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} \ No newline at end of file diff --git a/tests/test-forward-coarse.cpp b/tests/test-forward-coarse.cpp deleted file mode 100644 index 75f7aa8..0000000 --- a/tests/test-forward-coarse.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/coarse/test_pass_coarse_1.bin", // prompt: The amount of random conversations that lead to culture-shifting ideas is insane. 
- "./data/coarse/test_pass_coarse_2.bin", // prompt: Des Teufels liebstes Möbelstück ist die lange Bank - "./data/coarse/test_pass_coarse_3.bin", // prompt: खुदा ने बहुत सी अच्छी चीज बनाई है उस में एक हमारा दिमाग भी है बस उसे Use करने के लिए बता देता तो हम भी करोड़पति बन जाते I -}; - -static const int n_threads = 4; -static const int sliding_window_size = 60; -static const int max_coarse_history = 630; -static const float temp = 0.0f; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_coarse.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.coarse_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_sequence input; - bark_codes gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - ctx->semantic_tokens = input; - - bark_forward_coarse_encoder(ctx, max_coarse_history, sliding_window_size, temp, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(transpose(gt_tokens), ctx->coarse_tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-forward-encodec.cpp b/tests/test-forward-encodec.cpp deleted file mode 100644 index 5c9ee84..0000000 --- a/tests/test-forward-encodec.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/encodec/test_pass_encodec_1.bin", // prompt: El hombre que se levanta es aún más grande que el que no ha caído. - "./data/encodec/test_pass_encodec_2.bin", // prompt: ♪ Heal the world, Make it a better place, For you and for me, and the entire human race ♪ - "./data/encodec/test_pass_encodec_3.bin", // prompt: En été, mieux vaut suer que trembler. -}; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_codec.bin"; - - encodec_model model; - if (encodec_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_codes tokens; - audio_arr_t gt_audio_arr, audio_arr; - - for (int i = 0; i < (int) test_data.size(); i++) { - tokens.clear(); - gt_audio_arr.clear(); - audio_arr.clear(); - - std::string path = test_data[i]; - load_test_data(path, tokens, gt_audio_arr); - - audio_arr_t audio_arr = bark_forward_encodec(transpose(tokens), model); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_audio_arr, audio_arr)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} diff --git a/tests/test-forward-fine.cpp b/tests/test-forward-fine.cpp deleted file mode 100644 index 732f04e..0000000 --- a/tests/test-forward-fine.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/fine/test_pass_fine_1.bin", // prompt: Peut-on savoir où s'arrête le normal, où commence l'anormal ? Vous pouvez définir ces notions, vous, normalité, anormalité ? - "./data/fine/test_pass_fine_2.bin", // prompt: Brevity is the soul of wit. 
- "./data/fine/test_pass_fine_3.bin", // prompt: El hombre que se levanta es aún más grande que el que no ha caído. -}; - -static const int n_threads = 4; -static const float temp = 0.0f; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_fine.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.fine_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_codes input, gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - - // TODO: need to remove transpose - ctx->coarse_tokens = transpose(input); - bark_forward_fine_encoder(ctx, temp, n_threads); - bark_codes tokens = transpose(ctx->fine_tokens); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_tokens, tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-forward-semantic.cpp b/tests/test-forward-semantic.cpp deleted file mode 100644 index 8cd2375..0000000 --- a/tests/test-forward-semantic.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/semantic/test_pass_semantic_1.bin", // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark. - "./data/semantic/test_pass_semantic_2.bin", // prompt: Sometimes the heart sees what is invisible to the eye - "./data/semantic/test_pass_semantic_3.bin", // prompt: El Arte de Vencer se Aprende en las Derrotas -}; - -static const int n_threads = 4; -static const float min_eos_p = 0.2; -static const float temp = 0.0f; // deterministic sampling - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_text.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.text_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_sequence input; - bark_sequence gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - ctx->tokens = input; - - bark_forward_text_encoder(ctx, temp, min_eos_p, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_tokens, ctx->semantic_tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-gpt-eval.cpp b/tests/test-gpt-eval.cpp deleted file mode 100644 index 805755e..0000000 --- a/tests/test-gpt-eval.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "bark.h" -#include "common.h" - - -static const std::vector> test_args = { - { "./data/gpt_eval/test_gpt_eval_1_no_merge.bin", false }, // prompt: Hello, my name is Suno. And, uh - and I like pizza. [laughs] But I also have other interests such as playing tic tac toe. - { "./data/gpt_eval/test_gpt_eval_2_no_merge.bin", false }, // prompt: Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. 
But I suppose your english isn't terrible. - { "./data/gpt_eval/test_gpt_eval_3_no_merge.bin", false }, // prompt: ♪ In the jungle, the mighty jungle, the lion barks tonight ♪ - - { "./data/gpt_eval/test_gpt_eval_1_merge.bin", true }, // prompt: I have a silky smooth voice, and today I will tell you about the exercise regimen of the common sloth. - { "./data/gpt_eval/test_gpt_eval_2_merge.bin", true }, // prompt: You cannot, my good sir, take that away from me without having me retaliate in the most ferocious way. - { "./data/gpt_eval/test_gpt_eval_3_merge.bin", true }, // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark. -}; - -static const int n_threads = 4; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_text.bin"; - - gpt_model model; - if (gpt_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_sequence tokens; - logit_sequence gt_logits, logits; - - auto & hparams = model.hparams; - int n_vocab = hparams.n_out_vocab; - logits.resize(n_vocab); - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - for (int i = 0; i < (int) test_args.size(); i++) { - tokens.clear(); - gt_logits.clear(); - - std::string path = std::get<0>(test_args[i]); - bool merge_ctx = std::get<1>(test_args[i]); - - load_test_data(path, tokens, gt_logits); - - int n_past = 0; - gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_logits, logits)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} \ No newline at end of file
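The deleted `tests/test-forward-*.cpp` and `tests/test-gpt-eval.cpp` programs above exercised the internal `gpt_eval` / `bark_forward_*` helpers directly against binary fixtures. The new `bark/tests/test-coarse-encoder.cpp` and `bark/tests/test-fine-encoder.cpp` added in this diff instead load the full model and run the forward passes through the `bark_context`, seeding `bctx->semantic_tokens` or `bctx->coarse_tokens` by hand. A sketch of how the two stages chain under that API, assuming the signatures shown in those new tests; the semantic-token values are placeholders and the boolean returns mirror the `if (!...)` checks in the tests.

```cpp
// Sketch: driving the coarse and fine encoders through a bark_context, the way
// the new bark/tests/test-coarse-encoder.cpp and test-fine-encoder.cpp do.
#include <cstdio>

#include "bark.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <weights_dir>\n", argv[0]);
        return 1;
    }

    const int n_threads = 4;
    const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM;

    struct bark_context * bctx = bark_load_model(argv[1], verbosity);
    if (!bctx) {
        fprintf(stderr, "could not load model from '%s'\n", argv[1]);
        return 1;
    }

    // Stage inputs live on the context: seed the semantic tokens by hand
    // (placeholder values), then run coarse and fine in sequence.
    bctx->semantic_tokens = { 1913, 8020, 8572, 1722, 59, 28 };

    if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) {
        fprintf(stderr, "coarse encoder failed\n");
        return 1;
    }
    // bctx->coarse_tokens now holds [seq_length][n_coarse_codebooks] codes,
    // which the fine encoder consumes directly.
    if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) {
        fprintf(stderr, "fine encoder failed\n");
        return 1;
    }

    fprintf(stderr, "fine tokens: [%zu, %zu]\n",
            bctx->fine_tokens.size(), bctx->fine_tokens[0].size());

    bark_free(bctx);
    return 0;
}
```

Whether a matching helper exposes the semantic (text) stage the same way is not shown in this diff, so that step is left to the hard-coded seed above.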
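On the encodec side, the in-tree decoder deleted above (now provided by the `encodec.cpp` submodule) decodes residual vector quantization: `encodec_quantizer_decode_eval` sums, over the `n_q` codebooks, the embedding row selected by each codebook's code at every time step. A self-contained sketch of that arithmetic on plain vectors follows, assuming nothing about the submodule's own API; codes are indexed `[seq_length][n_q]` here for readability, whereas the removed ggml code views one codebook's whole code sequence at a time.

```cpp
// Self-contained sketch of residual-vector-quantization decoding, mirroring the
// per-codebook lookup-and-accumulate loop of the deleted encodec_quantizer_decode_eval.
// Each codebook embedding table is [n_bins][hidden_dim].
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> rvq_decode(
        const std::vector<std::vector<int32_t>> & codes,                 // [seq_length][n_q]
        const std::vector<std::vector<std::vector<float>>> & codebooks,  // [n_q][n_bins][hidden_dim]
        int hidden_dim) {
    const size_t seq_length = codes.size();
    const size_t n_q = codebooks.size();

    std::vector<std::vector<float>> out(seq_length, std::vector<float>(hidden_dim, 0.0f));

    for (size_t t = 0; t < seq_length; ++t) {
        for (size_t q = 0; q < n_q; ++q) {
            // Look up the embedding row chosen by this codebook's code ...
            const std::vector<float> & row = codebooks[q][codes[t][q]];
            for (int d = 0; d < hidden_dim; ++d) {
                out[t][d] += row[d];   // ... and accumulate: residual stages sum into one vector
            }
        }
    }
    return out;
}
```

The removed ggml implementation builds the same sum with `ggml_get_rows` per codebook and a running `ggml_add`, then transposes the result before handing it to the decoder convolutions.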