diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index ae78af1..7e2f8f1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -4,10 +4,31 @@ on: push: branches: - main - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + - encodec-submodule-fix-ci + paths: + [ + ".github/workflows/**", + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + ] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', ".github/workflows/**"] + paths: + [ + "**/CMakeLists.txt", + "**/Makefile", + "**/*.h", + "**/*.hpp", + "**/*.c", + "**/*.cpp", + "**/*.cu", + ".github/workflows/**", + ] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -24,7 +45,7 @@ jobs: id: checkout uses: actions/checkout@v4 with: - submodules: true + submodules: recursive - name: Dependencies id: depends @@ -35,6 +56,7 @@ jobs: - name: Build id: cmake_build run: | + cd bark mkdir build cd build cmake .. @@ -48,7 +70,7 @@ jobs: id: checkout uses: actions/checkout@v4 with: - submodules: true + submodules: recursive - name: Dependencies id: depends @@ -60,6 +82,7 @@ jobs: id: cmake_build run: | sysctl -a + cd bark mkdir build cd build cmake .. diff --git a/.gitmodules b/.gitmodules index f76ad7d..6629b72 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "ggml"] - path = ggml - url = https://github.com/ggerganov/ggml.git +[submodule "encodec.cpp"] + path = encodec.cpp + url = https://github.com/PABannier/encodec.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json index 95e18ea..0d930c2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -71,6 +71,13 @@ "algorithm": "cpp", "bit": "cpp", "cinttypes": "cpp", - "codecvt": "cpp" - } + "codecvt": "cpp", + "any": "cpp", + "forward_list": "cpp", + "ranges": "cpp", + "set": "cpp", + "span": "cpp", + "valarray": "cpp" + }, + "cmake.sourceDirectory": "/Users/pbannier/Documents/bark.cpp/bark" } \ No newline at end of file diff --git a/README.md b/README.md index 2b9d088..0a481ce 100644 --- a/README.md +++ b/README.md @@ -9,155 +9,83 @@ Inference of [SunoAI's bark model](https://github.com/suno-ai/bark) in pure C/C++. -**Disclaimer: there remains bug in the inference code, bark is able to generate audio for some prompts or some seeds, -but it does not work for most prompts. The current effort of the community is to fix those bugs, in order to release -v0.0.2**. - ## Description -The main goal of `bark.cpp` is to synthesize audio from a textual input with the [Bark](https://github.com/suno-ai/bark) model in efficiently using only CPU. +With `bark.cpp`, my goal is to bring **real-time realistic multilingual** text-to-speech generation to the community. Currently, I am focused on porting the [Bark](https://github.com/suno-ai/bark) model in C++. 
-- [X] Plain C/C++ implementation without dependencies -- [X] AVX, AVX2 and AVX512 for x86 architectures -- [X] Mixed F16 / F32 precision -- [X] 4-bit, 5-bit and 8-bit integer quantization -- [ ] Optimized via ARM NEON, Accelerate and Metal frameworks -- [ ] iOS on-device deployment using CoreML +- [x] Plain C/C++ implementation without dependencies +- [x] AVX, AVX2 and AVX512 for x86 architectures +- [x] CPU and GPU compatible backends +- [x] Mixed F16 / F32 precision +- [x] 4-bit, 5-bit and 8-bit integer quantization +- [x] Metal and CUDA backends The original implementation of `bark.cpp` is the bark's 24Khz English model. We expect to support multiple encoders in the future (see [this](https://github.com/PABannier/bark.cpp/issues/36) and [this](https://github.com/PABannier/bark.cpp/issues/6)), as well as music generation model (see [this](https://github.com/PABannier/bark.cpp/issues/62)). This project is for educational purposes. Demo on [Google Colab](https://colab.research.google.com/drive/1JVtJ6CDwxtKfFmEd8J4FGY2lzdL0d0jT?usp=sharing) ([#95](https://github.com/PABannier/bark.cpp/issues/95)) -**Supported platforms:** +--- -- [X] Mac OS -- [X] Linux -- [X] Windows +Here is a typical run using `bark.cpp`: -**Supported models:** +```java +make -j && ./main -p "This is an audio generated by bark.cpp" -- [X] Bark -- [ ] Vocos -- [ ] AudioCraft + __ __ + / /_ ____ ______/ /__ _________ ____ + / __ \/ __ `/ ___/ //_/ / ___/ __ \/ __ \ + / /_/ / /_/ / / / ,< _ / /__/ /_/ / /_/ / +/_.___/\__,_/_/ /_/|_| (_) \___/ .___/ .___/ + /_/ /_/ ---- -Here are typical audio pieces generated by `bark.cpp`: +bark_tokenize_input: prompt: 'this is a dog barking.' +bark_tokenize_input: number of tokens in prompt = 513, first 8 tokens: 20579 20172 10217 27883 28169 25677 10167 129595 -https://github.com/PABannier/bark.cpp/assets/12958149/f9f240fd-975f-4d69-9bb3-b295a61daaff +Generating semantic tokens: [========> ] (17%) -https://github.com/PABannier/bark.cpp/assets/12958149/c0caadfd-bed9-4a48-8c17-3215963facc1 +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 9.90 ms / 138 tokens +bark_print_statistics: predict time = 3163.78 ms / 22.92 ms per token +bark_print_statistics: total time = 3188.37 ms -Here is a typical run using Bark: +Generating coarse tokens: [==================================================>] (100%) -```java -make -j && ./main -p "this is an audio" -I bark.cpp build info: -I UNAME_S: Darwin -I UNAME_P: arm -I UNAME_M: arm64 -I CFLAGS: -I. -O3 -std=c11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -pthread -DGGML_USE_ACCELERATE -I CXXFLAGS: -I. 
-O3 -std=c++11 -fPIC -DNDEBUG -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar -pthread -I LDFLAGS: -framework Accelerate -I CC: Apple clang version 14.0.0 (clang-1400.0.29.202) -I CXX: Apple clang version 14.0.0 (clang-1400.0.29.202) - -bark_model_load: loading model from './ggml_weights' -bark_model_load: reading bark text model -gpt_model_load: n_in_vocab = 129600 -gpt_model_load: n_out_vocab = 10048 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 1 -gpt_model_load: n_wtes = 1 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1894.87 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1701.69 MB -bark_model_load: reading bark vocab - -bark_model_load: reading bark coarse model -gpt_model_load: n_in_vocab = 12096 -gpt_model_load: n_out_vocab = 12096 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 1 -gpt_model_load: n_wtes = 1 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1443.87 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1250.69 MB - -bark_model_load: reading bark fine model -gpt_model_load: n_in_vocab = 1056 -gpt_model_load: n_out_vocab = 1056 -gpt_model_load: block_size = 1024 -gpt_model_load: n_embd = 1024 -gpt_model_load: n_head = 16 -gpt_model_load: n_layer = 24 -gpt_model_load: n_lm_heads = 7 -gpt_model_load: n_wtes = 8 -gpt_model_load: ggml tensor size = 272 bytes -gpt_model_load: ggml ctx size = 1411.25 MB -gpt_model_load: memory size = 192.00 MB, n_mem = 24576 -gpt_model_load: model size = 1218.26 MB - -bark_model_load: reading bark codec model -encodec_model_load: model size = 44.32 MB - -bark_model_load: total model size = 74.64 MB - -bark_generate_audio: prompt: 'this is an audio' -bark_generate_audio: number of tokens in prompt = 513, first 8 tokens: 20579 20172 20199 33733 129595 129595 129595 129595 -bark_forward_text_encoder: ........................................................................................................... - -bark_forward_text_encoder: mem per token = 4.80 MB -bark_forward_text_encoder: sample time = 7.91 ms -bark_forward_text_encoder: predict time = 2779.49 ms / 7.62 ms per token -bark_forward_text_encoder: total time = 2829.35 ms - -bark_forward_coarse_encoder: ................................................................................................................................................................. -.................................................................................................................................................................. - -bark_forward_coarse_encoder: mem per token = 8.51 MB -bark_forward_coarse_encoder: sample time = 3.08 ms -bark_forward_coarse_encoder: predict time = 10997.70 ms / 33.94 ms per token -bark_forward_coarse_encoder: total time = 11036.88 ms - -bark_forward_fine_encoder: ..... 
- -bark_forward_fine_encoder: mem per token = 5.11 MB -bark_forward_fine_encoder: sample time = 39.85 ms -bark_forward_fine_encoder: predict time = 19773.94 ms -bark_forward_fine_encoder: total time = 19873.72 ms - - - -bark_forward_encodec: mem per token = 760209 bytes -bark_forward_encodec: predict time = 528.46 ms / 528.46 ms per token -bark_forward_encodec: total time = 663.63 ms +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 3.96 ms / 410 tokens +bark_print_statistics: predict time = 14303.32 ms / 34.89 ms per token +bark_print_statistics: total time = 14315.52 ms -Number of frames written = 51840. +Generating fine tokens: [==================================================>] (100%) +bark_print_statistics: mem per token = 0.00 MB +bark_print_statistics: sample time = 41.93 ms / 6144 tokens +bark_print_statistics: predict time = 15234.38 ms / 2.48 ms per token +bark_print_statistics: total time = 15282.15 ms + +Number of frames written = 51840. main: load time = 1436.36 ms main: eval time = 34520.53 ms -main: total time = 35956.92 ms +main: total time = 32786.04 ms ``` +Here are typical audio pieces generated by `bark.cpp`: + +https://github.com/PABannier/bark.cpp/assets/12958149/f9f240fd-975f-4d69-9bb3-b295a61daaff + +https://github.com/PABannier/bark.cpp/assets/12958149/c0caadfd-bed9-4a48-8c17-3215963facc1 + ## Usage -Here are the steps for the bark model. +Here are the steps to use Bark.cpp ### Get the code ```bash git clone --recursive https://github.com/PABannier/bark.cpp.git cd bark.cpp +git submodule update --init --recursive ``` ### Build @@ -165,8 +93,8 @@ cd bark.cpp In order to build bark.cpp you must use `CMake`: ```bash -mkdir build -cd build +mkdir bark/build +cd bark/build cmake .. cmake --build . --config Release ``` @@ -175,43 +103,43 @@ cmake --build . --config Release ```bash # install Python dependencies -python3 -m pip install -r requirements.txt +python3 -m pip install -r bark/requirements.txt # obtain the original bark and encodec weights and place them in ./models -python3 download_weights.py --download-dir ./models +python3 bark/download_weights.py --download-dir ./models # convert the model to ggml format -python3 convert.py \ +python3 bark/convert.py \ --dir-model ./models \ - --codec-path ./models \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ # run the inference -./main -m ./ggml_weights/ -p "this is an audio" +./bark/build/examples/main/main -m ./ggml_weights/ -p "this is an audio" ``` ### (Optional) Quantize weights Weights can be quantized using the following strategy: `q4_0`, `q4_1`, `q5_0`, `q5_1`, `q8_0`. -Note that to preserve audio quality, we do not quantize the codec model. The bulk of the -computation is in the forward pass of the GPT models. +Note that to preserve audio quality, we do not quantize the codec model. The bulk of the computation is in the forward pass of the GPT models. 
```bash -./quantize ./ggml_weights/ggml_weights_text.bin ./ggml_weights_q4/ggml_weights_text.bin q4_0 -./quantize ./ggml_weights/ggml_weights_coarse.bin ./ggml_weights_q4/ggml_weights_coarse.bin q4_0 -./quantize ./ggml_weights/ggml_weights_fine.bin ./ggml_weights_q4/ggml_weights_fine.bin q4_0 +mkdir ggml_weights_q4 +cp ggml_weights/*vocab* ggml_weights_q4 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_text.bin ./ggml_weights_q4/ggml_weights_text.bin q4_0 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_coarse.bin ./ggml_weights_q4/ggml_weights_coarse.bin q4_0 +./bark/build/examples/quantize/quantize ./ggml_weights/ggml_weights_fine.bin ./ggml_weights_q4/ggml_weights_fine.bin q4_0 ``` -### Seminal papers and background on models +### Seminal papers - Bark - - [Text Prompted Generative Audio](https://github.com/suno-ai/bark) + - [Text Prompted Generative Audio](https://github.com/suno-ai/bark) - Encodec - - [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) + - [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) - GPT-3 - - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) + - [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) ### Contributing @@ -225,5 +153,3 @@ computation is in the forward pass of the GPT models. - Avoid adding third-party dependencies, extra files, extra headers, etc. - Always consider cross-compatibility with other operating systems and architectures -- Avoid fancy looking modern STL constructs, keep it simple -- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & ref` diff --git a/bark-util.h b/bark-util.h deleted file mode 100644 index 50fd1e3..0000000 --- a/bark-util.h +++ /dev/null @@ -1,30 +0,0 @@ -#pragma once - -#include - -#define BARK_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "BARK_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -static const size_t MB = 4*1024*1024; - -template -static void read_safe(std::ifstream& fin, T& dest) { - fin.read((char*)& dest, sizeof(T)); -} - -template -static void write_safe(std::ofstream& fout, T& dest) { - fout.write((char*)& dest, sizeof(T)); -} - - -static size_t utf8_len(char src) { - const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; - uint8_t highbits = static_cast(src) >> 4; - return lookup[highbits]; -} diff --git a/bark.cpp b/bark.cpp deleted file mode 100644 index 62173f8..0000000 --- a/bark.cpp +++ /dev/null @@ -1,2200 +0,0 @@ -/* -Port of Suno's Bark to C/C++. 
- -Author: Pierre-Antoine Bannier -*/ -#include "bark.h" -#include "ggml.h" -#include "bark-util.h" - -// third-party utilities -#define DR_WAV_IMPLEMENTATION -#include "dr_wav.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define BARK_DEBUG 0 -#define EPS_NORM 1e-8 - -typedef std::vector bark_sequence; -typedef std::vector audio_arr_t; - -typedef std::vector> bark_codes; - -struct gpt_hparams { - int32_t n_in_vocab; - int32_t n_out_vocab; - int32_t n_layer; - int32_t n_head; - int32_t n_embd; - int32_t block_size; - int32_t n_lm_heads; - int32_t n_wtes; - int32_t ftype; - - int32_t n_codes_given = 1; -}; - -struct bark_vocab { - using id = int32_t; - using token = std::string; - - std::map token_to_id; - std::map id_to_token; -}; - -struct gpt_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt_model { - gpt_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wpe; - - std::vector wtes; - std::vector lm_heads; - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - std::map tensors; - - // - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - int64_t t_main_us = 0; - - // - int64_t n_sample = 0; - int64_t n_predict = 0; - - // - int64_t memsize = 0; - size_t mem_per_token = 0; -}; - -struct bark_model { - // encoder - gpt_model coarse_model; - gpt_model fine_model; - gpt_model text_model; - - // decoder - encodec_model codec_model; - - // vocab - bark_vocab vocab; - - int64_t memsize = 0; -}; - -struct bark_context { - bark_context(bark_model & model) : model(model) {} - ~bark_context() { - if (model_owner) { - delete &model; - } - } - - std::mt19937 rng; - - bark_model & model; - - bool model_owner = false; - - int64_t t_load_us; - int64_t t_start_us; - - bark_sequence tokens; - bark_sequence semantic_tokens; - - bark_codes coarse_tokens; - bark_codes fine_tokens; - - audio_arr_t audio_arr; - - float temp; - float fine_temp; - - float min_eos_p; - int sliding_window_size; - int max_coarse_history; -}; - -struct bark_progress { - float current = 0.0f; - const char * func; - - bark_progress(const char * func): func(func) {} - - void callback(float progress) { - float percentage = progress * 100; - if (percentage == 0.0f) { - fprintf(stderr, "%s: ", func); - } - while (percentage > current) { - current = percentage; - fprintf(stderr, "."); - fflush(stderr); - if (percentage >= 100) { - fprintf(stderr, "\n"); - } - } - } -}; - -struct bark_context * bark_new_context_with_model( - struct bark_model * model, - struct bark_context_params params) { - - if (!model) { - return nullptr; - } - - bark_context * ctx = new bark_context(*model); - - ctx->rng = std::mt19937(params.seed); - - ctx->temp = params.temp; - ctx->fine_temp = params.fine_temp; - - ctx->max_coarse_history = params.max_coarse_history; - ctx->sliding_window_size = params.sliding_window_size; - ctx->min_eos_p = params.min_eos_p; - - return ctx; -} - -struct 
bark_context_params bark_context_default_params() { - struct bark_context_params result = { - /*.seed =*/ 0, - /*.temp =*/ 0.7, - /*.fine_temp =*/ 0.5, - /*.min_eos_p =*/ 0.2, - /*.sliding_window_size =*/ 60, - /*.max_coarse_history =*/ 630, - }; - - return result; -} - -void bark_seed_rng(struct bark_context * ctx, int32_t seed) { - if (ctx) { - ctx->rng.seed(seed); - } -} - -int bark_vocab_load( - const char * fname, - bark_vocab * vocab, - int32_t expected_size) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: faield to open '%s'\n", __func__, fname); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname); - return 1; - } - } - - int32_t n_vocab; - read_safe(fin, n_vocab); - - // 5 special tokens: [UNK, SEP, MASK, PAD, CLS] - if (n_vocab != expected_size) { - fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, expected_size); - return 1; - } - - std::string word; - std::vector tmp; - - tmp.reserve(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - read_safe(fin, len); - - if (len > 0) { - tmp.resize(len); - fin.read(&tmp[0], tmp.size()); // read to buffer - word.assign(&tmp[0], tmp.size()); - } else { - word = ""; - } - - vocab->token_to_id[word] = i; - vocab->id_to_token[i] = word; - } - - return 0; -} - -int gpt_model_load(const std::string& fname, gpt_model& model) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return 1; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - read_safe(fin, hparams.n_layer); - read_safe(fin, hparams.n_head); - read_safe(fin, hparams.n_embd); - read_safe(fin, hparams.block_size); - read_safe(fin, hparams.n_in_vocab); - read_safe(fin, hparams.n_out_vocab); - read_safe(fin, hparams.n_lm_heads); - read_safe(fin, hparams.n_wtes); - read_safe(fin, hparams.ftype); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); - printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); - printf("%s: block_size = %d\n", __func__, hparams.block_size); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); - printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return 1; - } - - auto & ctx = model.ctx; - - size_t ctx_size = 0; - - { - const auto & hparams = model.hparams; - - const int n_embd = 
hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - const int n_in_vocab = hparams.n_in_vocab; - const int n_out_vocab = hparams.n_out_vocab; - const int n_lm_heads = hparams.n_lm_heads; - const int n_wtes = hparams.n_wtes; - - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g - ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b - - ctx_size += n_wtes*n_in_vocab*n_embd*ggml_type_sizef(wtype); // wte - ctx_size += block_size*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe - ctx_size += n_lm_heads*n_out_vocab*n_embd*ggml_type_sizef(wtype); // lm_head - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b - - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g - ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b - - ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w - ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b - - ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w - ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b - - ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w - ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b - - ctx_size += block_size*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k - ctx_size += block_size*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v - - ctx_size += (6 + 12*n_layer)*512; // object overhead - - printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); - printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); - } - - // create the ggml context - { - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ false, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - const int n_in_vocab = hparams.n_in_vocab; - const int n_out_vocab = hparams.n_out_vocab; - const int n_lm_heads = hparams.n_lm_heads; - const int n_wtes = hparams.n_wtes; - - model.layers.resize(n_layer); - model.lm_heads.resize(n_lm_heads); - model.wtes.resize(n_wtes); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, block_size); - - for (int i = 0; i < n_wtes; i++) { - model.wtes[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_in_vocab); - model.tensors["model/wte/" + std::to_string(i)] = model.wtes[i]; - } - - for (int i = 0; i < n_lm_heads; i++) { - model.lm_heads[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_out_vocab); - model.tensors["model/lm_head/" + std::to_string(i)] = model.lm_heads[i]; - } - - model.tensors["model/ln_f/g"] = model.ln_f_g; - model.tensors["model/ln_f/b"] = model.ln_f_b; - - model.tensors["model/wpe"] = model.wpe; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; - model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; - - model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; - model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; - } - } - - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int block_size = hparams.block_size; - - const int n_mem = n_layer*block_size; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - - // load weights - { - size_t total_size = 0; - - while(true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ttype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return 1; - } - - auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return 1; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return 1; - } - - const 
size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - return 1; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - // printf("%48s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], "float", ggml_nbytes(tensor)/1024.0/1024.0); - - total_size += ggml_nbytes(tensor); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - model.memsize = total_size; - } - - fin.close(); - - return 0; -} - -struct bark_model * bark_load_model_from_file(const char * dirname) { - printf("%s: loading model from '%s'\n", __func__, dirname); - - bark_model * model = new bark_model; - - // text - { - printf("%s: reading bark text model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_text.bin"; - if (gpt_model_load(fname, model->text_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->text_model.memsize; - } - - // vocab - { - printf("%s: reading bark vocab\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_vocab.bin"; - const gpt_hparams hparams = model->text_model.hparams; - const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; - if (bark_vocab_load(fname.c_str(), &model->vocab, expected_size) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); - return nullptr; - } - } - - // coarse - { - printf("\n%s: reading bark coarse model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_coarse.bin"; - if (gpt_model_load(fname, model->coarse_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->coarse_model.memsize; - } - - // fine - { - printf("\n%s: reading bark fine model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_fine.bin"; - if (gpt_model_load(fname, model->fine_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->fine_model.memsize; - } - - // codec - { - printf("\n%s: reading bark codec model\n", __func__); - const std::string fname = std::string(dirname) + "/ggml_weights_codec.bin"; - if (encodec_model_load(fname, model->codec_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s' (bad codec)\n", __func__, fname.c_str()); - return nullptr; - } - model->memsize += model->codec_model.memsize; - } - - printf("\n%s: total model size = %8.2f MB\n", __func__, model->memsize/1024.0/1024.0); - - return model; -} - -int ggml_common_quantize_0( - std::ifstream & fin, - std::ofstream & fout, - const ggml_ftype ftype, - const std::vector & to_quant, - const std::vector & to_skip) { - - ggml_type qtype = GGML_TYPE_F32; - - switch (ftype) { - case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: - case GGML_FTYPE_ALL_F32: - case GGML_FTYPE_MOSTLY_F16: - case 
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: - case GGML_FTYPE_MOSTLY_Q2_K: - case GGML_FTYPE_MOSTLY_Q3_K: - case GGML_FTYPE_MOSTLY_Q4_K: - case GGML_FTYPE_MOSTLY_Q5_K: - case GGML_FTYPE_MOSTLY_Q6_K: - { - fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); - return 1; - } - }; - - if (!ggml_is_quantized(qtype)) { - fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); - return 1; - } - - size_t total_size_org = 0; - size_t total_size_new = 0; - - std::vector work; - - std::vector data_u8; - std::vector data_f16; - std::vector data_f32; - - std::vector hist_all(1 << 4, 0); - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ttype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[4] = { 1, 1, 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); - - bool quantize = false; - - // check if we should quantize this tensor - for (const auto & s : to_quant) { - if (std::regex_match(name, std::regex(s))) { - quantize = true; - break; - } - } - - // check if we should skip this tensor - for (const auto & s : to_skip) { - if (std::regex_match(name, std::regex(s))) { - quantize = false; - break; - } - } - - // quantize only 2D tensors - quantize &= (n_dims == 2); - - if (quantize) { - if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { - fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return 1; - } - - if (ttype == GGML_TYPE_F16) { - data_f16.resize(nelements); - fin.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); - data_f32.resize(nelements); - for (int i = 0; i < nelements; ++i) { - data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); - } - } else { - data_f32.resize(nelements); - fin.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); - } - - ttype = qtype; - } else { - const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); - - data_u8.resize(nelements*bpe); - fin.read(reinterpret_cast(data_u8.data()), nelements * bpe); - } - - write_safe(fout, n_dims); - write_safe(fout, length); - write_safe(fout, ttype); - - for (int i = 0; i < n_dims; ++i) { - write_safe(fout, ne[i]); - } - fout.write(&name[0], length); - - if (quantize) { - work.resize(nelements); // for quantization - - size_t cur_size = 0; - std::vector hist_cur(1 << 4, 0); - - switch ((ggml_type) ttype) { - case GGML_TYPE_Q4_0: - { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q4_1: - { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_0: - { - cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q5_1: - { - cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_Q8_0: - { - cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); - } break; - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: - case GGML_TYPE_Q8_1: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_Q8_K: - case GGML_TYPE_COUNT: - { - fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); - return 1; - } - } - - fout.write(reinterpret_cast(work.data()), cur_size); - total_size_new += cur_size; - - printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < (int) hist_cur.size(); ++i) { - hist_all[i] += hist_cur[i]; - } - - for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / (float)nelements); - } - printf("\n"); - } else { - printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); - fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); - total_size_new += data_u8.size(); - } - - total_size_org += nelements * sizeof(float); - } - - printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); - printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); - - { - int64_t sum_all = 0; - for (int i = 0; i < (int) hist_all.size(); ++i) { - sum_all += hist_all[i]; - } - - printf("%s: hist: ", __func__); - for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); - } - printf("\n"); - } - - return 0; -} - -int bark_model_quantize( - const char * fname_inp, - const char * fname_out, - ggml_ftype ftype) { - printf("%s: loading model from '%s'\n", __func__, fname_inp); - - gpt_model model; - - auto fin = std::ifstream(fname_inp, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp); - return 1; - } - - auto fout = std::ofstream(fname_out, std::ios::binary); - if (!fout) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out); - return 1; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp); - return 1; - } - - fout.write((char *) &magic, sizeof(magic)); - } - - 
gpt_hparams hparams; - - // load hparams - { - auto & hparams = model.hparams; - - read_safe(fin, hparams.n_layer); - read_safe(fin, hparams.n_head); - read_safe(fin, hparams.n_embd); - read_safe(fin, hparams.block_size); - read_safe(fin, hparams.n_in_vocab); - read_safe(fin, hparams.n_out_vocab); - read_safe(fin, hparams.n_lm_heads); - read_safe(fin, hparams.n_wtes); - read_safe(fin, hparams.ftype); - - const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; - int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; - - printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); - printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); - printf("%s: block_size = %d\n", __func__, hparams.block_size); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = %d\n", __func__, hparams.n_layer); - printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); - printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); - printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); - printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); - printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); - printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); - - write_safe(fout, hparams.n_layer); - write_safe(fout, hparams.n_head); - write_safe(fout, hparams.n_embd); - write_safe(fout, hparams.block_size); - write_safe(fout, hparams.n_in_vocab); - write_safe(fout, hparams.n_out_vocab); - write_safe(fout, hparams.n_lm_heads); - write_safe(fout, hparams.n_wtes); - write_safe(fout, ftype_dst); - } - - // regexes of tensor names to be quantized - const std::vector to_quant = { - "model/wte/.*", - "model/lm_head/.*", - "model/h.*/attn/c_attn/w", - "model/h.*/attn/c_proj/w", - "model/h.*/mlp/c_fc/w", - "model/h.*/mlp/c_proj/w", - }; - - if (ggml_common_quantize_0(fin, fout, ftype, to_quant, {}) > 0) { - fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp); - return 1; - } - - fin.close(); - fout.close(); - - return 0; -} - -std::string strip_accents(const std::string &in_str) { - std::string out_str; - std::map accent_map = {{"À", 'A'},{"Á", 'A'}, - {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, - {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, - {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, - {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, - {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, - {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, - {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, - {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, - {"Ñ", 'N'},{"ñ", 'n'}, - }; - - for (size_t i = 0; i < in_str.length();) { - int len = utf8_len(in_str[i]); - std::string cur = in_str.substr(i, len); - auto iter = accent_map.find(cur); - if (iter != accent_map.end()) - out_str += iter->second; - else - out_str += cur; - - i += len; - } - - return out_str; -} - -void bert_tokenize( - const bark_vocab * vocab, - const char * text, - int32_t * tokens, - int32_t * n_tokens, - int32_t n_max_tokens) { - std::string str = text; - std::vector words; - - int32_t t = 0; - - auto * token_map = &vocab->token_to_id; - - // split the text into words - { - str = strip_accents(text); - - std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; - - std::regex re(pat); - std::smatch m; - - while (std::regex_search(str, m, re)) { - for 
(std::string x : m) - words.push_back(x); - str = m.suffix(); - } - } - - // apply wordpiece - for (const auto &word : words) { - if (word.size() == 0) - continue; - - std::string prefix = ""; - int i = 0; - int n = word.size(); - - loop: - while (i < n) { - if (t >= n_max_tokens - 1) - break; - int j = n; - while (j > i) { - auto it = token_map->find(prefix + word.substr(i, j - i)); - if (it != token_map->end()) { - tokens[t++] = it->second; - i = j; - prefix = "##"; - goto loop; - } - --j; - } - if (j == i) { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); - prefix = "##"; - ++i; - } - } - } - - *n_tokens = t; -} - -static struct ggml_cgraph * bark_build_fine_gpt_graph( - ggml_context * ctx0, - gpt_model * model, - bark_token * tokens, - int n_tokens, - int codebook_ix) { - // tokens: [n_channels, N] - const int N = n_tokens/N_FINE_CODEBOOKS; - const int n_channels = N_FINE_CODEBOOKS; - - const auto & hparams = model->hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.block_size; - const int n_head = hparams.n_head; - - const int n_codes_given = hparams.n_codes_given; - - BARK_ASSERT(N <= n_ctx); - BARK_ASSERT(codebook_ix > 0); - - struct ggml_cgraph * gf = ggml_new_graph(ctx0); - - struct ggml_tensor * inpL; - struct ggml_tensor * cur; - - struct ggml_tensor * input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_channels); - memcpy(input->data, tokens, N*n_channels*ggml_element_size(input)); - ggml_set_name(input, "input_tokens"); - - struct ggml_tensor * tok_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); - ggml_set_name(tok_emb, "token_embeddings"); - ggml_set_zero(tok_emb); - - for (int wte_ix = 0; wte_ix < codebook_ix + 1; wte_ix++) { - struct ggml_tensor * cur = ggml_get_rows(ctx0, - model->wtes[wte_ix], - ggml_view_1d(ctx0, input, N, wte_ix*input->nb[1])); - tok_emb = ggml_add(ctx0, tok_emb, cur); - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = i; - } - struct ggml_tensor * pos_emb = ggml_get_rows(ctx0, model->wpe, position); - ggml_set_name(pos_emb, "position_embeddings"); - - // wte + wpe - inpL = ggml_add(ctx0, tok_emb, pos_emb); - - struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); - ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); - - for (int il = 0; il < n_layer; il++) { - ggml_format_name(inpL, "layer_inp_%d", il); - - // norm - { - cur = ggml_norm(ctx0, inpL, EPS_NORM); - ggml_set_name(cur, "norm_0"); - - // cur = ln_1_g*cur + ln_1_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_1_b, cur)); - ggml_set_name(cur, "layer_norm_0"); - } - - // self-attention - { - // cur = attn_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_attn_w, cur); - ggml_set_name(cur, "attn_in_proj"); - - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - ggml_set_name(Qcur, "Qcur"); - - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - ggml_set_name(Kcur, "Kcur"); - - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - ggml_set_name(Vcur, "Vcur"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - ggml_set_name(Q, "Q"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Kcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - ggml_set_name(K, "K"); - - // [N, N, n_head] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_set_name(KQ, "KQ"); - - // [N, N, n_head] - struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); - ggml_set_name(KQ_scaled, "KQ_scaled"); - - // [N, N, n_head] - struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); - ggml_set_name(KQ_soft_max, "KQ_soft_max"); - - // [N, n_embd/n_head, n_head] - struct ggml_tensor * V_trans = - ggml_cont(ctx0, - ggml_permute(ctx0, - ggml_cpy(ctx0, - Vcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 1, 2, 0, 3)); - ggml_set_name(V_trans, "V_trans"); - - // [n_embd/n_head, N, n_head] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - ggml_set_name(KQV, "KQV"); - - // [n_embd/n_head, n_head, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_set_name(KQV_merged, "KQV_merged"); - - // [n_embd, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_set_name(cur, "KQV_merged_contiguous"); - - // cur = proj_w*cur - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_proj_w, - cur); - ggml_set_name(cur, "attn_out_proj"); - } - - // residual connection - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); - ggml_set_name(inpFF, "inpFF"); - - // feed-forward - { - // norm - { - cur = ggml_norm(ctx0, inpFF, EPS_NORM); - ggml_set_name(cur, "norm_1"); - - // cur = ln_2_g*cur + ln_2_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_2_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_2_b, cur)); - ggml_set_name(cur, "ffn_norm"); - } - - // cur = fc_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); - ggml_set_name(cur, "ffn_fc"); - - // GELU activation - cur = ggml_gelu(ctx0, cur); - ggml_set_name(cur, "ffn_gelu"); - - // cur = proj_w*cur - cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); - ggml_set_name(cur, "ffn_out_proj"); - } - - cur = ggml_add(ctx0, cur, inpFF); - ggml_set_name(cur, "inpFF_+_outFF"); - - // input for next layer - inpL = cur; - } - - cur = inpL; - - // norm - { - cur = ggml_norm(ctx0, cur, EPS_NORM); - ggml_set_name(cur, "norm_final"); - - // cur = ln_f_g*cur + ln_f_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->ln_f_g, cur), - cur), - ggml_repeat(ctx0, model->ln_f_b, cur)); - ggml_set_name(cur, "result_norm"); - } - - // cur = WTE * cur - struct ggml_tensor * lm_head = model->lm_heads[codebook_ix - n_codes_given]; - cur = ggml_mul_mat(ctx0, lm_head, cur); - ggml_set_name(cur, "result_output"); - - ggml_build_forward_expand(gf, cur); - - return gf; -} - -int fine_gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int n_threads, - int codebook_ix) { - // tokens: [n_channels, seq_length], sequences are contiguous - int64_t t_predict_start_us = ggml_time_us(); - - const int N = n_tokens/8; - const int n_channels = 8; - - const auto & hparams = model->hparams; - - const int n_vocab = hparams.n_out_vocab; - - GGML_ASSERT((N > 1) && (n_channels == 8)); - GGML_ASSERT(n_threads > 0); - - static size_t buf_size = 256u*1024*1024; - static void * buf = 
malloc(buf_size); - - if (model->mem_per_token > 0 && model->mem_per_token*n_tokens > buf_size) { - const size_t buf_size_new = 1.2*(model->mem_per_token*n_tokens); // add 20% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - ggml_cgraph * gf = bark_build_fine_gpt_graph(ctx0, model, tokens, n_tokens, codebook_ix); - - struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; - struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; - - GGML_ASSERT(strcmp(res->name, "result_output") == 0); - GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); - - // run the computation - ggml_graph_compute_with_ctx(ctx0, gf, n_threads); - - if (logits != NULL) { - // [N, n_vocab] - // [1024, 1056] - memcpy(logits, (float *) ggml_get_data(res), sizeof(float)*N*n_vocab); - } - - if (model->mem_per_token == 0) { - model->mem_per_token = ggml_used_mem(ctx0)/n_tokens; - } - - ggml_free(ctx0); - - int64_t t_predict_end_us = ggml_time_us(); - model->t_predict_us += (t_predict_end_us - t_predict_start_us); - model->n_predict += 1; - - return 0; -} - -bool gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int * n_past, - bool merge_ctx, - int n_threads) { - BARK_ASSERT(n_past != NULL); - - int64_t t_predict_start_us = ggml_time_us(); - - int N = n_tokens; - - const auto & hparams = model->hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.block_size; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_out_vocab; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - if (model->mem_per_token > 0 && model->mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.2*(model->mem_per_token*N); // add 20% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(input->data, tokens, N*ggml_element_size(input)); - - struct ggml_tensor * tok_emb; - - if (*n_past > 0) { - BARK_ASSERT(N == 1); - tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); - } else { - if (merge_ctx) { - BARK_ASSERT(N == 256+256+1); - N -= 256; - } else { - BARK_ASSERT(N <= n_ctx); - } - - if (merge_ctx) { - struct ggml_tensor * seq_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 0)); - struct ggml_tensor * ctx_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 256*ggml_element_size(input))); - struct ggml_tensor * rem_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 1, 512*ggml_element_size(input))); - - struct ggml_tensor * cat_emb = ggml_add(ctx0, seq_embd, ctx_embd); - - tok_emb = ggml_new_tensor_2d(ctx0, cat_emb->type, cat_emb->ne[0], cat_emb->ne[1]+rem_embd->ne[1]); - tok_emb = ggml_set_1d(ctx0, 
tok_emb, cat_emb, 0); - tok_emb = ggml_set_1d(ctx0, tok_emb, rem_embd, cat_emb->ne[0]*cat_emb->ne[1]*ggml_element_size(cat_emb)); - } else { - tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); - } - } - - struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = *n_past + i; - } - - // wte + wpe - struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, ggml_get_rows(ctx0, model->wpe, position)); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, EPS_NORM); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_1_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_1_b, cur)); - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - { - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_attn_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_attn_attn_b, cur), - cur); - } - - // self-attention - { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model->memory_k, N*n_embd, (ggml_element_size(model->memory_k)*n_embd)*(il*n_ctx + *n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model->memory_v, N*n_embd, (ggml_element_size(model->memory_v)*n_embd)*(il*n_ctx + *n_past)); - - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model->memory_k, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_k)*n_embd), - n_embd/n_head, n_head, *n_past + N), - 0, 2, 1, 3); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = - ggml_scale_inplace(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) - ); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, *n_past); - - // KQ = soft_max(KQ_masked) - // [n_past + N, N, 12] - 
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model->memory_v, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_v)*n_embd), - n_embd/n_head, n_head, *n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model->memory_v->type, *n_past + N, n_embd/n_head, n_head)); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model->layers[il].c_attn_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_attn_proj_b, cur), - cur); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - // norm - { - cur = ggml_norm(ctx0, inpFF, EPS_NORM); - - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ln_2_g, cur), - cur), - ggml_repeat(ctx0, model->layers[il].ln_2_b, cur)); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - cur = ggml_mul_mat(ctx0, - model->layers[il].c_mlp_fc_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_mlp_fc_b, cur), - cur); - - // GELU activation - // [3072, N] - cur = ggml_gelu(ctx0, cur); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - cur = ggml_mul_mat(ctx0, - model->layers[il].c_mlp_proj_w, - cur); - - cur = ggml_add(ctx0, - ggml_repeat(ctx0, model->layers[il].c_mlp_proj_b, cur), - cur); - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, EPS_NORM); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model->ln_f_g, inpL), - inpL), - ggml_repeat(ctx0, model->ln_f_b, inpL)); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model->lm_heads[0], inpL); - - // run the computation - ggml_build_forward_expand(&gf, inpL); - ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); - - if (logits != NULL) { - // return result just for the last token - memcpy(logits, (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); - } - - if (model->mem_per_token == 0) { - model->mem_per_token = ggml_used_mem(ctx0)/N; - } - - // updating n_past with N (-256 if merge_ctx) - if (n_past) - *n_past += N; - - ggml_free(ctx0); - - model->t_predict_us += (ggml_time_us() - 
t_predict_start_us); - model->n_predict += 1; - - return 0; -} - -void softmax(std::vector & logits) { - // for numerical stability - float maxl = -INFINITY; - for (const auto & l : logits) - maxl = std::max(maxl, l); - - // softmax - float sum = 0.0; - for (auto & l : logits) { - l = exp(l - maxl); - sum += l; - } - - for (auto & l : logits) - l /= sum; -} - -bark_token gpt_multinomial_sample( - std::vector & logits, - std::mt19937 & rng, - float temp, - float * eos_p) { - int n_logits = logits.size(); - - for (int i = 0; i < n_logits; ++i) - logits[i] /= temp; - - softmax(logits); - - std::discrete_distribution dist(logits.begin(), logits.end()); - int next = dist(rng); - - // likelihood of EOS token - if (eos_p) - *eos_p = logits[logits.size() - 1]; - - return next; -} - -bark_token gpt_argmax_sample(std::vector & logits, float * eos_p) { - int n_logits = logits.size(); - - // testing purposes - for (auto & l : logits) { l /= 0.7f; } - - // likelihood of EOS token - softmax(logits); - - if (eos_p) - *eos_p = logits[logits.size() - 1]; - - int next = 0; - float maxl = -INFINITY; - - for (int i = 0; i < n_logits; i++) { - if (logits[i] > maxl) { - maxl = logits[i]; - next = i; - } - } - - return next; -} - -bark_token gpt_sample( - std::vector & logits, - std::mt19937 & rng, - float temp, - float * eos_p, - int64_t * t_sample_us, - int64_t * n_sample) { - int64_t t_sample_start_us = ggml_time_us(); - - bark_token res; - if (temp == 0.0f) { - res = gpt_argmax_sample(logits, eos_p); - } else { - res = gpt_multinomial_sample(logits, rng, temp, eos_p); - } - - int64_t t_sample_end_us = ggml_time_us(); - *t_sample_us += (t_sample_end_us - t_sample_start_us); - *n_sample += 1; - - return res; -} - -void bark_tokenize_input(struct bark_context * ctx, const char * text) { - auto & model = ctx->model.text_model; - bark_vocab * vocab = &ctx->model.vocab; - - int32_t block_size = model.hparams.block_size; - int32_t max_ctx_size = std::min(block_size, 256); - int32_t n_tokens; - - bark_sequence tokens(max_ctx_size); - bert_tokenize(vocab, text, tokens.data(), &n_tokens, max_ctx_size); - - for (int i = 0; i < (int) tokens.size(); i++) - tokens[i] += TEXT_ENCODING_OFFSET; - - if (n_tokens < max_ctx_size) { - for (int i = n_tokens; i < max_ctx_size; i++) - tokens[i] = TEXT_PAD_TOKEN; - } else if (n_tokens > max_ctx_size) { - fprintf(stderr, "%s: input sequence is too long (%d > 256), truncating sequence", __func__, n_tokens); - } - - tokens.resize(max_ctx_size); - - // semantic history - for (int i = 0; i < 256; i++) - tokens.push_back(SEMANTIC_PAD_TOKEN); - tokens.push_back(SEMANTIC_INFER_TOKEN); - - assert(tokens.size() == 256 + 256 + 1); - - ctx->tokens = tokens; - - printf("%s: prompt: '%s'\n", __func__, text); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, ctx->tokens.size()); - for (int i = 0; i < std::min(8, (int) ctx->tokens.size()); i++) { - printf("%d ", ctx->tokens[i]); - } - printf("\n"); -} - -static void bark_print_statistics(gpt_model * model) { - printf("\n\n"); - printf("%s: mem per token = %8.2f MB\n", __func__, model->mem_per_token/1000.0f/1000.0f); - printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us/1000.0f, model->n_sample); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, model->t_predict_us/1000.0f, model->t_predict_us/model->n_predict/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, model->t_main_us/1000.0f); - printf("\n"); -} - -void bark_forward_text_encoder(struct bark_context * 
ctx, int n_threads) { - const int64_t t_main_start_us = ggml_time_us(); - - bark_sequence out; - - bark_progress progress( __func__); - - gpt_model * model = &ctx->model.text_model; - - auto & hparams = model->hparams; - const int n_vocab = hparams.n_out_vocab; - - float min_eos_p = ctx->min_eos_p; - float temp = ctx->temp; - - bark_sequence input = ctx->tokens; - - std::vector logits; - logits.resize(n_vocab); - - float eos_p = 0; - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - int n_past = 0; - - for (int i = 0; i < 768; i++) { - gpt_eval(model, input.data(), input.size(), logits.data(), &n_past, true, n_threads); - - std::vector relevant_logits(logits.begin(), logits.begin() + SEMANTIC_VOCAB_SIZE); - relevant_logits.push_back(logits[SEMANTIC_PAD_TOKEN]); - - input.clear(); - - bark_token next = gpt_sample( - logits, ctx->rng, temp, &eos_p, &model->t_sample_us, &model->n_sample); - - if (next == SEMANTIC_VOCAB_SIZE || eos_p >= min_eos_p) - break; - - input.push_back(next); - out.push_back(next); - - progress.callback((float) i/768); - } - - ctx->semantic_tokens = out; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); -} - -void bark_forward_coarse_encoder(struct bark_context * ctx, int n_threads) { - const int64_t t_main_start_us = ggml_time_us(); - - bark_codes out_coarse; - bark_sequence out; - - bark_progress progress(__func__); - - int max_coarse_history = ctx->max_coarse_history; - int sliding_window_size = ctx->sliding_window_size; - float temp = ctx->temp; - - float semantic_to_coarse_ratio = COARSE_RATE_HZ / SEMANTIC_RATE_HZ * N_COARSE_CODEBOOKS; - int max_semantic_history = floorf(max_coarse_history / semantic_to_coarse_ratio); - - int n_steps = floorf(ctx->semantic_tokens.size() * semantic_to_coarse_ratio / N_COARSE_CODEBOOKS) * N_COARSE_CODEBOOKS; - int step_ix = 0; - - BARK_ASSERT(n_steps > 0); - BARK_ASSERT(n_steps % N_COARSE_CODEBOOKS == 0); - - int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); - - gpt_model * model = &ctx->model.coarse_model; - - auto & hparams = model->hparams; - const int n_vocab = hparams.n_out_vocab; - - bark_sequence input = ctx->semantic_tokens; - - std::vector logits; - logits.resize(n_vocab); - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - for (int i = 0; i < n_window_steps; i++) { - int semantic_ix = roundf(n_steps / semantic_to_coarse_ratio); - - bark_sequence input_in( - input.begin() + std::max(semantic_ix-max_semantic_history, 0), - input.end() - ); - size_t original_size = input_in.size(); - input_in.resize(256); - - // padding from the right side - for (int ix = original_size; ix < 256; ix++) - input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN; - - input_in.push_back(COARSE_INFER_TOKEN); - - // concatenate input_in and input_coarse - input_in.insert( - input_in.end(), - std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), - std::make_move_iterator(out.end()) - ); - - int n_past = 0; - // TODO: this is a hack, - model->mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well - - for (int j = 0; j < sliding_window_size; j++) { - if (step_ix >= n_steps) - continue; - - gpt_eval(model, input_in.data(), input_in.size(), logits.data(), 
&n_past, false, n_threads); - - input_in.clear(); - - bool is_major = step_ix % N_COARSE_CODEBOOKS == 0; - int start_ix = SEMANTIC_VOCAB_SIZE + (1 - is_major) * CODEBOOK_SIZE; - int end_ix = SEMANTIC_VOCAB_SIZE + (2 - is_major) * CODEBOOK_SIZE; - std::vector relevant_logits(logits.begin() + start_ix, logits.begin() + end_ix); - - bark_token next = gpt_sample( - relevant_logits, ctx->rng, temp, NULL, &model->t_sample_us, &model->n_sample); - - next += start_ix; - - input_in.push_back(next); - out.push_back(next); - - step_ix += 1; - - progress.callback((float) (i*sliding_window_size+j)/n_steps); - } - } - - BARK_ASSERT((int) out.size() == n_steps); - BARK_ASSERT(out.size() % N_COARSE_CODEBOOKS == 0); - - // out_coarse: [seq_length, n_codes] - for (int i = 0; i < (int) out.size(); i += N_COARSE_CODEBOOKS) { - // this assumes N_COARSE_CODEBOOKS = 2 - bark_sequence _tmp = { - out[i] - SEMANTIC_VOCAB_SIZE, - out[i+1] - SEMANTIC_VOCAB_SIZE - CODEBOOK_SIZE - }; - out_coarse.push_back(_tmp); - } - - ctx->coarse_tokens = out_coarse; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); - -} - -void bark_forward_fine_encoder(struct bark_context * ctx, int n_threads) { - // input shape: [N, n_codes] - const int64_t t_main_start_us = ggml_time_us(); - - bark_progress progress(__func__); - - bark_codes input = ctx->coarse_tokens; - - float temp = ctx->fine_temp; - - std::vector logits; - logits.resize(1024*1056); - - gpt_model * model = &ctx->model.fine_model; - - int n_coarse = input[0].size(); - int original_seq_len = input.size(); - int n_remove_from_end = 0; - - // channel padding - for (int i = 0; i < (int) input.size(); i++) { - for (int j = N_COARSE_CODEBOOKS; j < N_FINE_CODEBOOKS; j++) { - input[i].push_back(CODEBOOK_SIZE); - } - } - - // spatial padding if sequence is too short - if (original_seq_len < 1024) { - n_remove_from_end = 1024 - original_seq_len; - for (int i = original_seq_len; i < 1024; i++) { - bark_sequence _tmp(N_FINE_CODEBOOKS, CODEBOOK_SIZE); - input.push_back(_tmp); - } - } - - // dry run to estimate mem_per_token - bark_token decoy[16] = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - fine_gpt_eval(model, decoy, 16, nullptr, n_threads, 2); - - int n_loops = std::max(0, (int) ceilf((input.size() - 1024)/512.f)) + 1; - - // in_arr: [seq_length, n_codes] - bark_codes in_arr = input; - - for (int n = 0; n < n_loops; n++) { - int start_ix = std::min(n * 512, (int) in_arr.size() - 1024); - int start_fill_ix = std::min(n * 512, (int) in_arr.size() - 512); - int rel_start_fill_ix = start_fill_ix - start_ix; - - // in_buffer: [n_codes*seq_length] (sequences are contiguous) - bark_sequence in_buffer; - for (int i = 0; i < N_FINE_CODEBOOKS; i++) { - for (int j = start_ix; j < start_ix + 1024; j++) { - in_buffer.push_back(in_arr[j][i]); - } - } - - for (int nn = n_coarse; nn < N_FINE_CODEBOOKS; nn++) { - fine_gpt_eval(model, in_buffer.data(), in_buffer.size(), logits.data(), n_threads, nn); - - for (int i = 0; i < 1024; i++) { - std::vector relevant_logits(logits.begin() + i*1056, logits.begin() + (i+1)*1056); - relevant_logits.resize(CODEBOOK_SIZE); - - bark_token next = gpt_sample( - relevant_logits, ctx->rng, temp, NULL, &model->t_sample_us, &model->n_sample); - - in_buffer[nn*1024 + rel_start_fill_ix + i] = next; - } - - progress.callback((float) (n*(N_FINE_CODEBOOKS-n_coarse)+(nn-n_coarse))/(n_loops*(N_FINE_CODEBOOKS-n_coarse))); - } - - // transfer over info into model_in - for (int nn = 
n_coarse; nn < N_FINE_CODEBOOKS; nn++) { - for (int j = 0; j < CODEBOOK_SIZE - rel_start_fill_ix; j++) { - in_arr[start_fill_ix+j][nn] = in_buffer[nn*1024 + rel_start_fill_ix + j]; - } - } - - } - - if (n_remove_from_end > 0) { - in_arr.resize(in_arr.size() - n_remove_from_end); - } - - BARK_ASSERT(ctx->coarse_tokens.size() == in_arr.size()); - - ctx->fine_tokens = in_arr; - - const int64_t t_main_end_us = ggml_time_us(); - model->t_main_us = t_main_end_us - t_main_start_us; - - bark_print_statistics(model); -} - -int encodec_eval( - const bark_codes & tokens, - encodec_model & model, - audio_arr_t & audio_arr) { - // input shape: [seq_length, n_codes] - int64_t t_predict_start_us = ggml_time_us(); - - const int N = tokens.size(); - const int n_codes = tokens[0].size(); - - bark_codes input = tokens; - - static size_t buf_size = 256u*1024*1024; - static void * buf = malloc(buf_size); - - if (model.mem_per_token > 0 && model.mem_per_token*N*n_codes > buf_size) { - const size_t buf_size_new = 1.1*(model.mem_per_token*N*n_codes); // add 10% to account for ggml object overhead - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return 1; - } - } - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, - /*.no_alloc =*/ false, - }; - - struct ggml_context * ctx0 = ggml_init(params); - struct ggml_cgraph gf = {}; - - struct ggml_tensor * codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_codes); - for (int c = 0; c < n_codes; c++) { - bark_sequence _tmp; - for (int i = 0; i < N; i++) - _tmp.push_back(input[i][c]); - int offset = ggml_element_size(codes)*c*N; - memcpy((void *) ((char *) codes->data + offset), _tmp.data(), N*ggml_element_size(codes)); - } - - struct ggml_tensor * quantized_out = encodec_quantizer_decode_eval(ctx0, model, codes); - struct ggml_tensor * output = encodec_decoder_eval(ctx0, model, quantized_out); - - ggml_build_forward_expand(&gf, output); - // TODO: adapt ggml_conv_1d and ggml_conv_trans_1d implementation to use multiple - // threads. 
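As a side note on `encodec_eval` above: the `[seq_length, n_codes]` token matrix is copied into the `codes` tensor codebook-major, one contiguous row of `N` values per codebook. A minimal standalone sketch of that layout, with toy values only:

```cpp
// Sketch of the codebook-major flattening used when filling the codes tensor.
// Toy data; the real copy goes through memcpy into a ggml I32 tensor.
#include <cstdio>
#include <vector>

int main() {
    const int N = 4;        // seq_length
    const int n_codes = 2;  // number of codebooks

    // tokens[i][c]: code for position i in codebook c
    std::vector<std::vector<int>> tokens = {
        {11, 21}, {12, 22}, {13, 23}, {14, 24},
    };

    // flat[c*N + i]: row c holds the whole sequence for codebook c
    std::vector<int> flat(N * n_codes);
    for (int c = 0; c < n_codes; ++c)
        for (int i = 0; i < N; ++i)
            flat[c * N + i] = tokens[i][c];

    for (int k = 0; k < N * n_codes; ++k)
        printf("%d ", flat[k]);   // prints: 11 12 13 14 21 22 23 24
    printf("\n");
    return 0;
}
```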
- ggml_graph_compute_with_ctx(ctx0, &gf, 1); - - int out_seq_length = output->ne[0]; - audio_arr.resize(out_seq_length); - memcpy(audio_arr.data(), (float *) ggml_get_data(output), sizeof(float)*out_seq_length); - - if (model.mem_per_token == 0) { - model.mem_per_token = ggml_used_mem(ctx0)/N/n_codes; - } - - ggml_free(ctx0); - - model.t_predict_us += (ggml_time_us() - t_predict_start_us); - - return 0; -} - -void bark_forward_encodec(struct bark_context * ctx) { - const int64_t t_main_start_us = ggml_time_us(); - - auto & model = ctx->model.codec_model; - - // dry run to estimate mem_per_token - bark_codes toy_data; - for (int i = 0; i < 20; i++) { - bark_sequence _tmp(4, i); - toy_data.push_back(_tmp); - } - encodec_eval(toy_data, model, ctx->audio_arr); - - // actual run - encodec_eval(ctx->fine_tokens, model, ctx->audio_arr); - - const int64_t t_main_end_us = ggml_time_us(); - model.t_main_us = t_main_end_us - t_main_start_us; - - printf("\n\n"); - printf("%s: mem per token = %zu bytes\n", __func__, model.mem_per_token); - printf("%s: predict time = %8.2f ms\n", __func__, model.t_predict_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, model.t_main_us/1000.0f); - printf("\n"); -} - -int write_wav_on_disk(audio_arr_t& audio_arr, std::string dest_path) { - drwav_data_format format; - format.container = drwav_container_riff; - format.format = DR_WAVE_FORMAT_IEEE_FLOAT; - format.channels = 1; - format.sampleRate = SAMPLE_RATE; - format.bitsPerSample = 32; - - drwav wav; - drwav_init_file_write(&wav, dest_path.c_str(), &format, NULL); - drwav_uint64 frames = drwav_write_pcm_frames(&wav, audio_arr.size(), audio_arr.data()); - drwav_uninit(&wav); - - fprintf(stderr, "Number of frames written = %lld.\n", frames); - - return 0; -} - -int bark_generate_audio( - struct bark_context * ctx, - const char * text, - const char * dest_wav_path, - int n_threads) { - bark_tokenize_input(ctx, text); - - bark_forward_text_encoder (ctx, n_threads); - bark_forward_coarse_encoder(ctx, n_threads); - bark_forward_fine_encoder (ctx, n_threads); - - bark_forward_encodec(ctx); - - write_wav_on_disk(ctx->audio_arr, dest_wav_path); - - return 0; -} - - -void bark_free_model(struct bark_model * model) { - delete model; -} - -void bark_free(bark_context * ctx) { - ggml_free(ctx->model.coarse_model.ctx); - ggml_free(ctx->model.fine_model.ctx); - ggml_free(ctx->model.text_model.ctx); - ggml_free(ctx->model.codec_model.ctx); - - delete ctx; -} diff --git a/bark.h b/bark.h deleted file mode 100644 index 0e3a7a5..0000000 --- a/bark.h +++ /dev/null @@ -1,164 +0,0 @@ -#ifndef BARK_H -#define BARK_H - -#include "encodec.h" - -#include -#include -#include -#include - -#ifdef BARK_SHARED -# if defined(_WIN32) && !defined(__MINGW32__) -# ifdef BARK_BUILD -# define BARK_API __declspec(dllexport) -# else -# define BARK_API __declspec(dllimport) -# endif -# else -# define BARK_API __attribute__ ((visibility ("default"))) -# endif -#else -# define BARK_API -#endif - -#define SAMPLE_RATE 24000 - -#define CLS_TOKEN_ID 101 -#define SEP_TOKEN_ID 102 - -#define TEXT_ENCODING_OFFSET 10048 -#define TEXT_PAD_TOKEN 129595 - -#define CODEBOOK_SIZE 1024 -#define N_COARSE_CODEBOOKS 2 -#define N_FINE_CODEBOOKS 8 - -#define SEMANTIC_PAD_TOKEN 10000 -#define SEMANTIC_INFER_TOKEN 129599 -#define SEMANTIC_VOCAB_SIZE 10000 -#define SEMANTIC_RATE_HZ 49.9 - -#define COARSE_RATE_HZ 75 -#define COARSE_SEMANTIC_PAD_TOKEN 12048 -#define COARSE_INFER_TOKEN 12050 - - -#ifdef __cplusplus -extern "C" { -#endif - - // - // C interface - // - - 
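The deleted `bark.h` below exposed a small C API around the full pipeline (`bark_generate_audio` chains tokenization, the three GPT encoders, EnCodec decoding and WAV output). A minimal caller against that old API looked roughly like this; the weights directory, prompt and thread count are placeholders:

```cpp
// Hypothetical caller of the pre-refactor API declared in this header.
#include "bark.h"

int main() {
    struct bark_model * model = bark_load_model_from_file("./ggml_weights");
    if (!model) return 1;

    struct bark_context_params params = bark_context_default_params();
    struct bark_context * ctx = bark_new_context_with_model(model, params);

    // text prompt in, 24 kHz mono WAV out
    bark_generate_audio(ctx, "this is an audio", "./output.wav", 4);

    bark_free(ctx);
    bark_free_model(model);
    return 0;
}
```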
typedef int32_t bark_token; - - struct bark_context; - struct bark_progress; - - struct bark_context_params { - uint32_t seed; // RNG seed - - float temp; // Temperature for sampling (text and coarse encoders) - float fine_temp; // Temperature for sampling (fine encoder) - - float min_eos_p; // Minimum probability for EOS token (text encoder) - int sliding_window_size; // Sliding window size for coarse encoder - int max_coarse_history; // Max history for coarse encoder - }; - - struct bark_model; - struct bark_vocab; - - struct gpt_hparams; - struct gpt_layer; - struct gpt_model; - - BARK_API struct bark_context_params bark_context_default_params(void); - - BARK_API struct bark_context * bark_new_context_with_model( - struct bark_model * model, - struct bark_context_params params); - - BARK_API void bark_seed_rng(struct bark_context * ctx, int32_t seed); - - BARK_API void bark_free(struct bark_context * ctx); - - BARK_API void bark_free_model(struct bark_model * ctx); - - BARK_API int bark_generate_audio( - struct bark_context * ctx, - const char * text, - const char * dest_wav_path, - int n_threads); - - BARK_API struct bark_model * bark_load_model_from_file(const char * dirname); - - BARK_API int bark_model_quantize( - const char * fname_inp, - const char * fname_out, - ggml_ftype ftype); - - BARK_API int bark_vocab_load( - const char * fname, - bark_vocab * vocab, - int32_t expected_size); - -#ifdef __cplusplus -} -#endif - -#ifdef BARK_API_INTERNAL - - // - // Internal API for tests - // - - typedef std::vector bark_sequence; - typedef std::vector> bark_codes; - typedef std::vector audio_arr_t; - - int gpt_model_load(const std::string& fname, gpt_model& model); - - int gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int * n_past, - bool merge_ctx, - int n_threads); - - bool fine_gpt_eval( - gpt_model * model, - bark_token * tokens, - int n_tokens, - float * logits, - int n_threads, - int codebook_ix); - - void bert_tokenize( - const bark_vocab * vocab, - const char * text, - int32_t * tokens, - int32_t * n_tokens, - int32_t n_max_tokens); - - void bark_forward_text_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_coarse_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_fine_encoder( - struct bark_context * ctx, - int n_threads); - - void bark_forward_encodec(struct bark_context * ctx); - -#endif // BARK_API_INTERNAL - -#endif // BARK_H diff --git a/CMakeLists.txt b/bark/CMakeLists.txt similarity index 50% rename from CMakeLists.txt rename to bark/CMakeLists.txt index 9e923a7..94d5711 100644 --- a/CMakeLists.txt +++ b/bark/CMakeLists.txt @@ -1,5 +1,5 @@ -cmake_minimum_required(VERSION 3.12) -project("bark.cpp" C CXX) +cmake_minimum_required(VERSION 3.12) +project("bark" C CXX) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) @@ -7,8 +7,7 @@ if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) +set(CMAKE_CXX_FLAGS_RELEASE "-O3") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) set(BARK_STANDALONE ON) @@ -16,34 +15,27 @@ else() set(BARK_STANDALONE OFF) endif() -option(BARK_BUILD_TESTS "bark: build tests" ${BARK_STANDALONE}) -option(BARK_BUILD_EXAMPLES "bark: build examples" ${BARK_STANDALONE}) +option(BARK_BUILD_TESTS "bark: build tests" 
${BARK_STANDALONE}) +option(BARK_BUILD_EXAMPLES "bark: build examples" ${BARK_STANDALONE}) # Build libraries -add_subdirectory(ggml) +set(BARK_LIB bark) -set(BARK_LIB bark.cpp) +# add_subdirectory(../ggml ${CMAKE_BINARY_DIR}/ggml) +add_subdirectory(../encodec.cpp ${CMAKE_BINARY_DIR}/encodec.cpp) -add_library( - ${BARK_LIB} - bark - bark.cpp - bark.h - bark-util.h - encodec.cpp - encodec.h -) - -target_link_libraries(${BARK_LIB} PUBLIC ggml) -target_include_directories(${BARK_LIB} PUBLIC .) -target_compile_features(${BARK_LIB} PUBLIC cxx_std_11) +add_library(${BARK_LIB} bark.cpp bark.h) if (BARK_BUILD_EXAMPLES) add_subdirectory(examples) endif() -if (BARK_BUILD_TESTS AND NOT CMAKE_JS_VERSION) +if (BARK_BUILD_TESTS) include(CTest) add_subdirectory(tests) endif () + +target_link_libraries(${BARK_LIB} PUBLIC ggml encodec) +target_include_directories(${BARK_LIB} PUBLIC .) +target_compile_features(${BARK_LIB} PUBLIC cxx_std_11) diff --git a/bark/bark.cpp b/bark/bark.cpp new file mode 100644 index 0000000..2ab7fe8 --- /dev/null +++ b/bark/bark.cpp @@ -0,0 +1,2333 @@ +/* Port of Suno's Bark to C/C++. */ +#include "ggml.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "bark.h" +#include "encodec.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define EPS_NORM 1e-5f + + +void print_tensor(struct ggml_tensor * a) { + float sum = 0; + float maxv = -INFINITY; + float minv = INFINITY; + if (a) { + for (int i = 0; i < a->ne[3]; i++) { + for (int j = 0; j < a->ne[2]; j++) { + for (int k = 0; k < a->ne[1]; k++) { + for (int l = 0; l < a->ne[0]; l++) { + if (a->type == GGML_TYPE_F32) { + float * aval = (float *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + sum += *aval; + maxv = MAX(*aval, maxv); + minv = MIN(*aval, minv); + // printf("%.4f ", *aval); + } else if (a->type == GGML_TYPE_F16) { + ggml_fp16_t * tmp = (ggml_fp16_t *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + float aval = ggml_fp16_to_fp32(*tmp); + sum += aval; + maxv = MAX(aval, maxv); + minv = MIN(aval, minv); + // printf("%.4f ", aval); + } else if (a->type == GGML_TYPE_I32) { + int32_t * aval = (int32_t *) ( + (char *) a->data + i*a->nb[3] + j*a->nb[2] + k*a->nb[1] + l*a->nb[0]); + sum += (float) *aval; + maxv = MAX((float) *aval, maxv); + minv = MIN((float) *aval, minv); + // printf("%d ", *aval); + } else { + throw std::runtime_error("Wrong tensor type."); + } + } + // printf("\n"); + } + // printf("\n\n"); + } + } + printf("sum=%.2f; max=%.2f; min=%.2f\n", sum, maxv, minv); + printf("shape=[%lld, %lld, %lld, %lld]\n", a->ne[0], a->ne[1], a->ne[2], a->ne[3]); + } +} + +class BarkProgressBar { + public: + BarkProgressBar(std::string func_name, double needed_progress) { + this->func_name = func_name; + this->needed_progress = needed_progress; + } + + void update(double new_progress) { + current_progress += new_progress; + amount_of_filler = (int)((current_progress / needed_progress)*(double)pbar_length); + } + void print() { + printf("\r%s: %s", func_name.c_str(), initial_part.c_str()); + for (int a = 0; a < amount_of_filler; a++) { + printf("%s", pbar_filler.c_str()); + } + printf("%s", pbar_updater.c_str()); + for (int b = 0; b < pbar_length - amount_of_filler; b++) { + printf(" "); + } + printf("%s (%d%%)", 
last_part.c_str(), (int)(100*(current_progress/needed_progress))); + fflush(stdout); + } + + std::string initial_part = "[", last_part = "]"; + std::string pbar_filler = "=", pbar_updater = ">"; + + private: + std::string func_name; + double needed_progress, current_progress = 0; + int amount_of_filler, pbar_length = 50; +}; + +template +static void read_safe(std::ifstream& fin, T& dest) { + fin.read((char*)& dest, sizeof(T)); +} + +template +static void write_safe(std::ofstream& fout, T& dest) { + fout.write((char*)& dest, sizeof(T)); +} + +static void bark_print_statistics(gpt_model * model) { + printf("\n\n"); + printf("%s: sample time = %8.2f ms / %lld tokens\n", __func__, model->t_sample_us/1000.0f, model->n_sample); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, model->t_predict_us/1000.0f, model->t_predict_us/model->n_sample/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, model->t_main_us/1000.0f); + printf("\n"); +} + +static void softmax(std::vector & logits) { + // for numerical stability + float maxl = -INFINITY; + for (const auto & l : logits) + maxl = std::max(maxl, l); + + // softmax + float sum = 0.0; + for (auto & l : logits) { + l = exp(l - maxl); + sum += l; + } + + for (auto & l : logits) + l /= sum; +} + +static bark_token gpt_multinomial_sample( + std::vector & logits, + std::mt19937 & rng, + float temp, + float * eos_p) { + int n_logits = logits.size(); + + for (int i = 0; i < n_logits; ++i) + logits[i] /= temp; + + softmax(logits); + + std::discrete_distribution dist(logits.begin(), logits.end()); + int next = dist(rng); + + // likelihood of EOS token + if (eos_p) + *eos_p = logits[logits.size() - 1]; + + return next; +} + +static bark_token gpt_argmax_sample(std::vector & logits, float * eos_p) { + int n_logits = logits.size(); + + // testing purposes + for (auto & l : logits) { l /= 0.7f; } + + // likelihood of EOS token + softmax(logits); + + if (eos_p) + *eos_p = logits[logits.size() - 1]; + + int next = 0; + float maxl = -INFINITY; + + for (int i = 0; i < n_logits; i++) { + if (logits[i] > maxl) { + maxl = logits[i]; + next = i; + } + } + + return next; +} + +static bark_token gpt_sample( + std::vector & logits, + std::mt19937 & rng, + float temp, + float * eos_p, + int64_t * t_sample_us, + int64_t * n_sample) { + int64_t t_sample_start_us = ggml_time_us(); + + bark_token res; + if (temp == 0.0f) { + res = gpt_argmax_sample(logits, eos_p); + } else { + res = gpt_multinomial_sample(logits, rng, temp, eos_p); + } + + int64_t t_sample_end_us = ggml_time_us(); + *t_sample_us += (t_sample_end_us - t_sample_start_us); + *n_sample += 1; + + return res; +} + +bool bark_vocab_load( + const std::string & fname, + bark_vocab * vocab, + int32_t expected_size) { + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + int32_t n_vocab; + read_safe(fin, n_vocab); + + // 5 special tokens: [UNK, SEP, MASK, PAD, CLS] + if (n_vocab != expected_size) { + fprintf(stderr, "%s: wrong voculary size (%d != %d)\n", __func__, n_vocab, expected_size); + return false; + } + + std::string word; + std::vector tmp; + + tmp.reserve(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + read_safe(fin, len); + + if 
(len > 0) { + tmp.resize(len); + fin.read(&tmp[0], tmp.size()); // read to buffer + word.assign(&tmp[0], tmp.size()); + } else { + word = ""; + } + + vocab->token_to_id[word] = i; + vocab->id_to_token[i] = word; + } + + return true; +} + +static size_t utf8_len(char src) { + const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +static std::string strip_accents(const std::string & in_str) { + std::string out_str; + std::map accent_map = {{"À", 'A'},{"Á", 'A'}, + {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, + {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, + {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, + {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, + {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, + {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, + {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, + {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, + {"Ñ", 'N'},{"ñ", 'n'}, + }; + + for (size_t i = 0; i < in_str.length();) { + int len = utf8_len(in_str[i]); + std::string cur = in_str.substr(i, len); + auto iter = accent_map.find(cur); + if (iter != accent_map.end()) + out_str += iter->second; + else + out_str += cur; + + i += len; + } + + return out_str; +} + +void bert_tokenize( + const bark_vocab * vocab, + const char * text, + int32_t * tokens, + int32_t * n_tokens, + int32_t n_max_tokens) { + std::string str = text; + std::vector words; + + int32_t t = 0; + + auto * token_map = &vocab->token_to_id; + + // split the text into words + { + str = strip_accents(text); + + std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; + + std::regex re(pat); + std::smatch m; + + while (std::regex_search(str, m, re)) { + for (std::string x : m) + words.push_back(x); + str = m.suffix(); + } + } + + // apply wordpiece + for (const auto &word : words) { + if (word.size() == 0) + continue; + + std::string prefix = ""; + int i = 0; + int n = word.size(); + + loop: + while (i < n) { + if (t >= n_max_tokens - 1) + break; + int j = n; + while (j > i) { + auto it = token_map->find(prefix + word.substr(i, j - i)); + if (it != token_map->end()) { + tokens[t++] = it->second; + i = j; + prefix = "##"; + goto loop; + } + --j; + } + if (j == i) { + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + prefix = "##"; + ++i; + } + } + } + + *n_tokens = t; +} + +static void bark_tokenize_input(struct bark_context * bctx, const std::string & text) { + auto & model = bctx->model.text_model; + bark_vocab * vocab = &bctx->model.vocab; + + auto & params = bctx->params; + + int32_t block_size = model.hparams.block_size; + int32_t max_ctx_size = std::min(block_size, 256); + int32_t n_tokens; + + bark_sequence tokens(max_ctx_size); + bert_tokenize(vocab, text.data(), tokens.data(), &n_tokens, max_ctx_size); + + for (int i = 0; i < (int) tokens.size(); i++) + tokens[i] += params.text_encoding_offset; + + if (n_tokens < max_ctx_size) { + for (int i = n_tokens; i < max_ctx_size; i++) + tokens[i] = params.text_pad_token; + } else if (n_tokens > max_ctx_size) { + fprintf(stderr, "%s: input sequence is too long (%d > 256), truncating sequence", __func__, n_tokens); + } + + tokens.resize(max_ctx_size); + + // semantic history + for (int i = 0; i < 256; i++) + tokens.push_back(params.semantic_pad_token); + tokens.push_back(params.semantic_infer_token); 
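`bert_tokenize` above applies greedy longest-match WordPiece: at each position it tries the longest vocabulary entry first, and continuation pieces carry a "##" prefix. A self-contained sketch with a toy vocabulary (real IDs come from the model's vocab file):

```cpp
// Minimal greedy longest-match WordPiece sketch, toy vocabulary only.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    std::map<std::string, int> vocab = {
        {"bark", 1}, {"##ing", 2}, {"dog", 3}, {"##s", 4},
    };

    auto wordpiece = [&](const std::string & word, std::vector<int> & out) {
        std::string prefix;
        size_t i = 0;
        while (i < word.size()) {
            size_t j = word.size();
            bool found = false;
            while (j > i) {
                auto it = vocab.find(prefix + word.substr(i, j - i));
                if (it != vocab.end()) { out.push_back(it->second); found = true; break; }
                --j;
            }
            if (found) i = j;     // consume the matched piece
            else       ++i;       // unknown piece: skip one character
            prefix = "##";        // subsequent pieces are continuations
        }
    };

    std::vector<int> tokens;
    wordpiece("barking", tokens);
    wordpiece("dogs", tokens);
    for (int t : tokens) printf("%d ", t);  // prints: 1 2 3 4
    printf("\n");
    return 0;
}
```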
+ + assert(tokens.size() == 256 + 256 + 1); + + bctx->tokens = tokens; + + printf("%s: prompt: '%s'\n", __func__, text.c_str()); + printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, bctx->tokens.size()); + for (int i = 0; i < std::min(8, (int) bctx->tokens.size()); i++) { + printf("%d ", bctx->tokens[i]); + } + printf("\n\n"); +} + +static bool gpt_load_model_weights( + const std::string & fname, + gpt_model & model, + int n_gpu_layers, + bark_verbosity_level verbosity) { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str()); + } + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + read_safe(fin, hparams.n_layer); + read_safe(fin, hparams.n_head); + read_safe(fin, hparams.n_embd); + read_safe(fin, hparams.block_size); + read_safe(fin, hparams.bias); + read_safe(fin, hparams.n_in_vocab); + read_safe(fin, hparams.n_out_vocab); + read_safe(fin, hparams.n_lm_heads); + read_safe(fin, hparams.n_wtes); + read_safe(fin, hparams.ftype); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); + printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); + printf("%s: block_size = %d\n", __func__, hparams.block_size); + printf("%s: bias = %d\n", __func__, hparams.bias); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); + printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + } + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx; + + size_t buffer_size = 0; + size_t n_tensors = 0; + + // Evaluating context size + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + const int n_in_vocab = hparams.n_in_vocab; + const int n_out_vocab = hparams.n_out_vocab; + const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + const int bias = hparams.bias; + + buffer_size += n_embd * ggml_type_size(GGML_TYPE_F32); // ln_f_g + + buffer_size += n_wtes * n_in_vocab * n_embd * ggml_type_size(wtype); // wtes + buffer_size += block_size * n_embd * ggml_type_size(GGML_TYPE_F32); // wpe + buffer_size += n_lm_heads * n_out_vocab * n_embd * ggml_type_size(wtype); 
// lm_head + + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_1_g + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_2_g + + buffer_size += n_layer * (3 * n_embd * n_embd * ggml_type_size(wtype)); // c_attn_attn_w + buffer_size += n_layer * ( n_embd * n_embd * ggml_type_size(wtype)); // c_attn_proj_w + + buffer_size += n_layer * (4 * n_embd * n_embd * ggml_type_size(wtype)); // c_mlp_fc_w + buffer_size += n_layer * (4 * n_embd * n_embd * ggml_type_size(wtype)); // c_mlp_proj_w + + if (bias) { + buffer_size += n_embd * ggml_type_size(GGML_TYPE_F32); // ln_f_b + + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_1_b + buffer_size += n_layer * (n_embd * ggml_type_size(GGML_TYPE_F32)); // ln_2_b + + buffer_size += n_layer * (3 * n_embd * ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b + buffer_size += n_layer * ( n_embd * ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b + + buffer_size += n_layer * (4 * n_embd * ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b + buffer_size += n_layer * ( n_embd * ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b + } + + buffer_size += 10ull*MB; // object overhead + + n_tensors = ( + 1 + // ln_f_g + n_wtes + 1 + // wtes, wpe + 2 * n_layer + // ln_1_g, ln_2_g + 2 * n_layer + // c_attn_attn_w, c_attn_proj_w + 2 * n_layer + // c_mlp_fc_w, c_mlp_proj_w + n_lm_heads + // lm_head + 2 // memory_k, memory_v + ); + + if (bias) { + n_tensors += 1; // ln_f_b + n_tensors += 2 * n_layer; // ln_1_b, ln_2_b + n_tensors += 4 * n_layer; // c_attn_attn_b, c_attn_proj_b, c_mlp_fc_b, c_mlp_proj_b + } + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0)); + } + } + + // create the ggml context + { + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + model.backend = ggml_backend_cuda_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); + model.backend = ggml_backend_metal_init(); + if (!model.backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } + } +#endif + + if (!model.backend) { + // fallback to CPU backend + if (verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: no backend specified, using CPU backend\n", __func__); + } + model.backend = ggml_backend_cpu_init(); + } + + if (!model.backend) { + if (verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: failed to initialize CPU backend\n", __func__); + } + + return false; + } + + // allocate weights buffer + model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size); + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + const int n_in_vocab = hparams.n_in_vocab; + const int n_out_vocab = hparams.n_out_vocab; + 
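The buffer-size estimate above simply sums element counts times `ggml_type_size()` for every weight tensor. A rough stand-in with hypothetical hparams and F16 weights, bias terms omitted, just to give a sense of scale:

```cpp
// Back-of-the-envelope weight-buffer estimate; hparams here are made up,
// the real loader reads them from the model file and uses ggml_type_size().
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t n_embd = 768, n_layer = 12, block_size = 1024;
    const int64_t n_in_vocab = 129600, n_out_vocab = 10048;
    const int64_t n_lm_heads = 1, n_wtes = 1;
    const size_t  f32 = 4, f16 = 2;                    // bytes per element

    size_t sz = 0;
    sz += n_embd * f32;                                // ln_f_g
    sz += n_wtes * n_in_vocab * n_embd * f16;          // token embeddings
    sz += block_size * n_embd * f32;                   // position embeddings
    sz += n_lm_heads * n_out_vocab * n_embd * f16;     // lm_head(s)
    sz += n_layer * (2 * n_embd * f32);                // ln_1_g, ln_2_g
    sz += n_layer * (3 + 1) * n_embd * n_embd * f16;   // attn qkv + proj
    sz += n_layer * (4 + 4) * n_embd * n_embd * f16;   // mlp fc + proj
    sz += 10ull * 1024 * 1024;                         // object overhead

    printf("estimated weight buffer: %.2f MB\n", sz / 1024.0 / 1024.0);
    return 0;
}
```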
const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + const int bias = hparams.bias; + + model.layers.resize(n_layer); + model.lm_heads.resize(n_lm_heads); + model.wtes.resize(n_wtes); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + if (bias) { + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + } + + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, block_size); + + for (int i = 0; i < n_wtes; i++) { + model.wtes[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_in_vocab); + model.tensors["model/wte/" + std::to_string(i)] = model.wtes[i]; + } + + for (int i = 0; i < n_lm_heads; i++) { + model.lm_heads[i] = ggml_new_tensor_2d(ctx, wtype, n_embd, n_out_vocab); + model.tensors["model/lm_head/" + std::to_string(i)] = model.lm_heads[i]; + } + + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wpe"] = model.wpe; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + + if (bias) { + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + } + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int block_size = hparams.block_size; + + const int n_lm_heads = hparams.n_lm_heads; + const int n_wtes = hparams.n_wtes; + + const int n_mem = n_layer*block_size; + const int n_elements = n_embd*n_mem; + + if (n_lm_heads == 1 && n_wtes == 1) { + // hack: if one LM head and one token embedding layer, we are loading weights + // of the text and coarse encoder. In this case, we need KV cache. + // for fine encoder, no need for KV cache, skip this part. 
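For scale, the KV cache allocated below (only for the text and coarse models, which have a single LM head and a single embedding table) is two F32 tensors of `n_layer * block_size * n_embd` elements each. A back-of-the-envelope computation with hypothetical hparams:

```cpp
// Rough KV-cache sizing for the branch below; hparams are illustrative only.
#include <cstdio>
#include <cstdint>

int main() {
    const int64_t n_embd = 768, n_layer = 12, block_size = 1024;

    const int64_t n_mem       = n_layer * block_size;  // cached positions across all layers
    const int64_t n_elements  = n_embd * n_mem;        // per K (and per V) tensor
    const size_t  memory_size = 2 * n_elements * sizeof(float);

    printf("KV cache: %.2f MB for %lld cached positions\n",
           memory_size / 1024.0 / 1024.0, (long long) n_mem);
    return 0;
}
```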
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // create a backend buffer (can be in host or device memory) + model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); + + // allocate the tensors into the backend buffer + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); + + // this updates the pointers in the tensors to point to the correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend + ggml_allocr_alloc(alloc, model.memory_k); + ggml_allocr_alloc(alloc, model.memory_v); + + ggml_allocr_free(alloc); + } + } + } + + // load weights + { + ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + + size_t total_size = 0; + + std::vector read_buf; + + while(true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ttype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return false; + } + + auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + ggml_allocr_alloc(alloc, tensor); + + if (ggml_backend_is_cpu(model.backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(model.backend) +#endif + ) { + // for the CPU and Metal backends, we can read directly into the device memory + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } + + if (verbosity == bark_verbosity_level::HIGH) { + printf("%48s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], "float", ggml_nbytes(tensor)/1024.0/1024.0); + } + + total_size += ggml_nbytes(tensor); + } + + ggml_allocr_free(alloc); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + model.memsize = 
total_size; + } + + fin.close(); + + return true; +} + +static bool ggml_quantize_weights( + std::ifstream & fin, + std::ofstream & fout, + const ggml_ftype ftype, + const std::vector & to_quant, + const std::vector & to_skip) { + ggml_type qtype = GGML_TYPE_F32; + + switch (ftype) { + case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: + case GGML_FTYPE_ALL_F32: + case GGML_FTYPE_MOSTLY_F16: + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: + case GGML_FTYPE_MOSTLY_Q2_K: + case GGML_FTYPE_MOSTLY_Q3_K: + case GGML_FTYPE_MOSTLY_Q4_K: + case GGML_FTYPE_MOSTLY_Q5_K: + case GGML_FTYPE_MOSTLY_Q6_K: + { + fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); + return false; + } + }; + + if (!ggml_is_quantized(qtype)) { + fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype)); + return false; + } + + size_t total_size_org = 0; + size_t total_size_new = 0; + + std::vector work; + + std::vector data_u8; + std::vector data_f16; + std::vector data_f32; + + std::vector hist_all(1 << 4, 0); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + read_safe(fin, n_dims); + read_safe(fin, length); + read_safe(fin, ttype); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[4] = { 1, 1, 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + read_safe(fin, ne[i]); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); + + bool quantize = false; + + // check if we should quantize this tensor + for (const auto & s : to_quant) { + if (std::regex_match(name, std::regex(s))) { + quantize = true; + break; + } + } + + // check if we should skip this tensor + for (const auto & s : to_skip) { + if (std::regex_match(name, std::regex(s))) { + quantize = false; + break; + } + } + + // quantize only 2D tensors + quantize &= (n_dims == 2); + + if (quantize) { + if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) { + fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + + if (ttype == GGML_TYPE_F16) { + data_f16.resize(nelements); + fin.read(reinterpret_cast(data_f16.data()), nelements * sizeof(ggml_fp16_t)); + data_f32.resize(nelements); + for (int i = 0; i < nelements; ++i) { + data_f32[i] = ggml_fp16_to_fp32(data_f16[i]); + } + } else { + data_f32.resize(nelements); + fin.read(reinterpret_cast(data_f32.data()), nelements * sizeof(float)); + } + + ttype = qtype; + } else { + const int bpe = (ttype == 0) ? 
sizeof(float) : sizeof(uint16_t); + + data_u8.resize(nelements*bpe); + fin.read(reinterpret_cast(data_u8.data()), nelements * bpe); + } + + write_safe(fout, n_dims); + write_safe(fout, length); + write_safe(fout, ttype); + for (int i = 0; i < n_dims; ++i) { + write_safe(fout, ne[i]); + } + fout.write(&name[0], length); + + if (quantize) { + work.resize(nelements); // for quantization + + size_t cur_size = 0; + std::vector hist_cur(1 << 4, 0); + + switch ((ggml_type) ttype) { + case GGML_TYPE_Q4_0: + { + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_1: + { + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_0: + { + cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q5_1: + { + cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q8_0: + { + cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_Q8_K: + case GGML_TYPE_COUNT: + { + fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); + return false; + } + } + + fout.write(reinterpret_cast(work.data()), cur_size); + total_size_new += cur_size; + + printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); + for (int i = 0; i < (int) hist_cur.size(); ++i) { + hist_all[i] += hist_cur[i]; + } + + for (int i = 0; i < (int) hist_cur.size(); ++i) { + printf("%5.3f ", hist_cur[i] / (float)nelements); + } + printf("\n"); + } else { + printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); + fout.write(reinterpret_cast(data_u8.data()), data_u8.size()); + total_size_new += data_u8.size(); + } + + total_size_org += nelements * sizeof(float); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); + printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); + + { + int64_t sum_all = 0; + for (int i = 0; i < (int) hist_all.size(); ++i) { + sum_all += hist_all[i]; + } + + printf("%s: hist: ", __func__); + for (int i = 0; i < (int) hist_all.size(); ++i) { + printf("%5.3f ", hist_all[i] / (float)sum_all); + } + printf("\n"); + } + + return true; +} + +static struct ggml_cgraph * bark_build_gpt_graph( + gpt_model * model, + ggml_allocr * allocr, + bark_sequence & tokens, + int * n_past, + bool merge_ctx, + int n_threads) { + if (!n_past) { + fprintf(stderr, "%s: n_past is null\n", __func__); + return NULL; + } + + int N = tokens.size(); + + const auto & hparams = model->hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.block_size; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_out_vocab; + const int bias = hparams.bias; + + static size_t buf_size = ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + 
/*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * input = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, input); + + // avoid writing to tensors if we are only measuring the memory usage + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(input, tokens.data(), 0, N*ggml_element_size(input)); + } + + struct ggml_tensor * tok_emb; + + if (*n_past > 0) { + assert(N == 1); + tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); + } else { + if (merge_ctx) { + assert(N == 256+256+1); + N -= 256; + } else { + assert(N <= n_ctx); + } + + if (merge_ctx) { + struct ggml_tensor * seq_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 0)); + struct ggml_tensor * ctx_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 256, 256*ggml_element_size(input))); + struct ggml_tensor * rem_embd = ggml_get_rows(ctx0, model->wtes[0], ggml_view_1d(ctx0, input, 1, 512*ggml_element_size(input))); + + struct ggml_tensor * cat_emb = ggml_add(ctx0, seq_embd, ctx_embd); + + tok_emb = ggml_new_tensor_2d(ctx0, cat_emb->type, cat_emb->ne[0], cat_emb->ne[1]+rem_embd->ne[1]); + ggml_allocr_alloc(allocr, tok_emb); + + tok_emb = ggml_set_1d(ctx0, tok_emb, cat_emb, 0); + tok_emb = ggml_set_1d(ctx0, tok_emb, rem_embd, cat_emb->ne[0]*cat_emb->ne[1]*ggml_element_size(cat_emb)); + } else { + tok_emb = ggml_get_rows(ctx0, model->wtes[0], input); + } + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + int32_t v = *n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } + } + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // wte + wpe + struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, ggml_get_rows(ctx0, model->wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL, EPS_NORM); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_1_g); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].ln_1_b); + } + } + + // attn + { + cur = ggml_mul_mat(ctx0, + model->layers[il].c_attn_attn_w, + cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_attn_attn_b); + } + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model->memory_k, N*n_embd, (ggml_element_size(model->memory_k)*n_embd)*(il*n_ctx + *n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model->memory_v, N*n_embd, (ggml_element_size(model->memory_v)*n_embd)*(il*n_ctx + *n_past)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + 
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model->memory_k, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_k)*n_embd), + n_embd/n_head, n_head, *n_past + N), + 0, 2, 1, 3); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, *n_past); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model->memory_v, (*n_past + N)*n_embd, il*n_ctx*ggml_element_size(model->memory_v)*n_embd), + n_embd/n_head, n_head, *n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model->memory_v->type, *n_past + N, n_embd/n_head, n_head)); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + { + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_proj_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_attn_proj_b); + } + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, EPS_NORM); + + // cur = ln_2_g*cur + ln_2_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_2_g); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].ln_2_b); + } + } + + // cur = fc_w*cur + fc_b + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_mlp_fc_b); + } + + cur = ggml_gelu(ctx0, cur); + + // projection + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); + + if (bias) { + cur = ggml_add(ctx0, cur, model->layers[il].c_mlp_proj_b); + } + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, EPS_NORM); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_mul(ctx0, inpL, model->ln_f_g); + + if (bias) { + inpL = ggml_add(ctx0, inpL, model->ln_f_b); + } + } + + inpL = ggml_mul_mat(ctx0, + model->lm_heads[0], + ggml_view_1d(ctx0, inpL, inpL->ne[0], (inpL->ne[1]-1)*inpL->nb[1])); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +static ggml_cgraph * bark_build_fine_gpt_graph( + gpt_model * model, + ggml_allocr * allocr, + bark_sequence & tokens, + int codebook_idx, + int n_fine_codebooks, + int n_threads) { + // tokens: [n_channels, N] + const int N = tokens.size() / n_fine_codebooks; + const int n_channels = n_fine_codebooks; + + const auto & hparams = model->hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.block_size; + const int n_head = hparams.n_head; + + const int n_codes_given = hparams.n_codes_given; + + assert(N <= n_ctx); + assert(codebook_idx > 0); + + static size_t buf_size = ggml_tensor_overhead() * GGML_MAX_NODES + ggml_graph_overhead(); + static std::vector buf(buf_size); + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, + }; + + struct ggml_context * ctx0 = ggml_init(ggml_params); + + struct ggml_cgraph * 
gf = ggml_new_graph(ctx0); + + struct ggml_tensor * input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, N, n_channels); + ggml_allocr_alloc(allocr, input); + + struct ggml_tensor * tok_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N); + ggml_allocr_alloc(allocr, tok_emb); + + if (!ggml_allocr_is_measure(allocr)) { + ggml_backend_tensor_set(input, tokens.data(), 0, N*n_channels*ggml_element_size(input)); + ggml_set_zero(tok_emb); + } + + for (int wte_ix = 0; wte_ix < codebook_idx + 1; wte_ix++) { + struct ggml_tensor * cur = ggml_get_rows(ctx0, + model->wtes[wte_ix], + ggml_view_1d(ctx0, input, N, wte_ix*input->nb[1])); + + tok_emb = ggml_add(ctx0, tok_emb, cur); + } + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_allocr_alloc(allocr, position); + if (!ggml_allocr_is_measure(allocr)) { + for (int i = 0; i < N; ++i) { + ggml_backend_tensor_set(position, &i, i*sizeof(int32_t), sizeof(int32_t)); + } + } + + struct ggml_tensor * pos_emb = ggml_get_rows(ctx0, model->wpe, position); + + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_allocr_alloc(allocr, KQ_scale); + if (!ggml_allocr_is_measure(allocr)) { + float s = 1.0f/sqrtf(float(n_embd)/n_head); + ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s)); + } + + // wte + wpe + struct ggml_tensor * inpL = ggml_add(ctx0, tok_emb, pos_emb); + + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL, EPS_NORM); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_mul(ctx0, cur, model->layers[il].ln_1_g); + cur = ggml_add(ctx0, cur, model->layers[il].ln_1_b); + } + + // self-attention + { + // cur = attn_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_attn_w, cur); + + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled); + + struct ggml_tensor * V_trans = + ggml_cont(ctx0, + ggml_permute(ctx0, + ggml_cpy(ctx0, + Vcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 1, 2, 0, 3)); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // [n_embd, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // cur = proj_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_attn_proj_w, cur); + } + + // residual connection + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward + { + // norm + { + cur = ggml_norm(ctx0, inpFF, EPS_NORM); + + cur = ggml_mul(ctx0, cur, model->layers[il].ln_2_g); + cur = ggml_add(ctx0, cur, model->layers[il].ln_2_b); + } + + // cur = fc_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_fc_w, cur); + + // GELU activation + 
cur = ggml_gelu(ctx0, cur); + + // cur = proj_w*cur + cur = ggml_mul_mat(ctx0, model->layers[il].c_mlp_proj_w, cur); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL, EPS_NORM); + + inpL = ggml_mul(ctx0, inpL, model->ln_f_g); + inpL = ggml_add(ctx0, inpL, model->ln_f_b); + } + + // inpL = WTE * inpL + struct ggml_tensor * lm_head = model->lm_heads[codebook_idx - n_codes_given]; + inpL = ggml_mul_mat(ctx0, lm_head, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +static bool bark_eval_encoder_internal( + gpt_model & model, + ggml_allocr * allocr, + bark_sequence & input, + std::vector & logits, + int * n_past, + bool merge_ctx, + int n_threads) { + auto & hparams = model.hparams; + const int n_vocab = hparams.n_out_vocab; + + const int64_t t_predict_us_start = ggml_time_us(); + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, input, n_past, merge_ctx, n_threads); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + int N = input.size(); + if (merge_ctx && *n_past == 0) { + N -= 256; + } + + logits.resize(n_vocab); + ggml_backend_tensor_get(inpL, logits.data(), 0, sizeof(float)*n_vocab); + + // updating n_past with N (-256 if merge_ctx) + if (n_past) { + *n_past += N; + } + + model.t_predict_us += ggml_time_us() - t_predict_us_start; + + return true; + +} + +static bool bark_eval_fine_encoder_internal( + struct bark_context * bctx, + bark_sequence & input, + std::vector & logits, + int n_threads, + int nn) { + auto & model = bctx->model.fine_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + auto & params = bctx->params; + + const int n_vocab = hparams.n_out_vocab; + const int block_size = hparams.block_size; + + const int n_fine_codebooks = params.n_fine_codebooks; + + const int64_t t_predict_us_start = ggml_time_us(); + + // reset the allocator to free all the memory allocated during the previous inference + ggml_allocr_reset(allocr); + + struct ggml_cgraph * gf = bark_build_fine_gpt_graph( + &model, allocr, input, nn, n_fine_codebooks, n_threads); + + // allocate tensors + ggml_allocr_alloc_graph(allocr, gf); + + // run the computation + if (ggml_backend_is_cpu(model.backend)) { + ggml_backend_cpu_set_n_threads(model.backend, n_threads); + } +#ifdef GGML_USE_METAL + if (ggml_backend_is_metal(model.backend)) { + ggml_backend_metal_set_n_cb(model.backend, n_threads); + } +#endif + ggml_backend_graph_compute(model.backend, gf); + + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + ggml_backend_tensor_get(inpL, logits.data(), 0, sizeof(float)*n_vocab*block_size); + + model.t_predict_us += ggml_time_us() - t_predict_us_start; + + return true; +} + +static bool bark_eval_text_encoder(struct bark_context * bctx, int n_threads) { + bark_sequence input = bctx->tokens; + bark_sequence output; + + auto & params = bctx->params; + + int32_t n_steps_text_encoder = params.n_steps_text_encoder; + int32_t semantic_vocab_size = params.semantic_vocab_size; + int32_t 
semantic_pad_token = params.semantic_pad_token; + + BarkProgressBar progress(std::string("Generating semantic tokens"), n_steps_text_encoder); + + auto & model = bctx->model.text_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + const int n_vocab = hparams.n_out_vocab; + + float min_eos_p = bctx->params.min_eos_p; + float temp = bctx->params.temp; + + std::vector logits; + logits.resize(n_vocab); + + float eos_p = 0; + int n_past = 0; + + for (int i = 0; i < n_steps_text_encoder; i++) { + if (!bark_eval_encoder_internal(model, allocr, input, logits, &n_past, true, n_threads)) { + fprintf(stderr, "%s: Could not generate token\n", __func__); + return false; + } + + std::vector relevant_logits(logits.begin(), logits.begin() + semantic_vocab_size); + relevant_logits.push_back(logits[semantic_pad_token]); + + input.clear(); + + bark_token next = gpt_sample( + logits, bctx->rng, temp, &eos_p, &model.t_sample_us, &model.n_sample); + + if (next == semantic_vocab_size || eos_p >= min_eos_p) { + break; + } + + input.push_back(next); + output.push_back(next); + + progress.update(1); + progress.print(); + } + + bctx->semantic_tokens = output; + + return true; +} + +static bool bark_eval_coarse_encoder(struct bark_context * bctx, int n_threads) { + bark_codes out_coarse; + bark_sequence out; + + bark_sequence input = bctx->semantic_tokens; + + auto & model = bctx->model.coarse_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + auto & params = bctx->params; + + const int n_vocab = hparams.n_out_vocab; + + std::vector logits; + logits.resize(n_vocab); + + int max_coarse_history = params.max_coarse_history; + int sliding_window_size = params.sliding_window_size; + int n_coarse_codebooks = params.n_coarse_codebooks; + int semantic_vocab_size = params.semantic_vocab_size; + int codebook_size = params.codebook_size; + + float coarse_rate_hz = params.coarse_rate_hz; + float semantic_rate_hz = params.semantic_rate_hz; + + int32_t coarse_semantic_pad_token = params.coarse_semantic_pad_token; + int32_t coarse_infer_token = params.coarse_infer_token; + + float temp = params.temp; + + float stc_ratio = coarse_rate_hz / semantic_rate_hz * n_coarse_codebooks; + + int max_semantic_history = floorf(max_coarse_history / stc_ratio); + + int n_steps = floorf(input.size() * stc_ratio / n_coarse_codebooks) * n_coarse_codebooks; + assert(n_steps > 0); + assert(n_steps % n_coarse_codebooks == 0); + + BarkProgressBar progress(std::string("Generating coarse tokens"), n_steps); + + int n_window_steps = ceilf(static_cast(n_steps) / sliding_window_size); + + int step_idx = 0; + + for (int i = 0; i < n_window_steps; i++) { + int semantic_idx = roundf(n_steps / stc_ratio); + + bark_sequence input_in( + input.begin() + std::max(semantic_idx - max_semantic_history, 0), + input.end() + ); + + size_t original_size = input_in.size(); + input_in.resize(256); + + // padding from the right side + for (int ix = original_size; ix < 256; ix++) { + input_in[ix] = coarse_semantic_pad_token; + } + input_in.push_back(coarse_infer_token); + + // concatenate input_in and input_coarse + input_in.insert( + input_in.end(), + std::make_move_iterator(out.end() - std::min(max_coarse_history, (int) out.size())), + std::make_move_iterator(out.end()) + ); + + int n_past = 0; + + for (int j = 0; j < sliding_window_size; j++) { + if (step_idx >= n_steps) { + continue; + } + + if (!bark_eval_encoder_internal(model, allocr, input_in, logits, &n_past, false, n_threads)) { + fprintf(stderr, "%s: Could not 
generate token\n", __func__); + return false; + } + + input_in.clear(); + + bool is_major = step_idx % n_coarse_codebooks == 0; + int start_idx = semantic_vocab_size + (1 - is_major) * codebook_size; + int end_idx = semantic_vocab_size + (2 - is_major) * codebook_size; + + std::vector relevant_logits( + logits.begin() + start_idx, + logits.begin() + end_idx + ); + + bark_token next = gpt_sample( + relevant_logits, bctx->rng, temp, NULL, &model.t_sample_us, &model.n_sample); + + next += start_idx; + + input_in.push_back(next); + out.push_back(next); + + step_idx += 1; + + progress.update(1); + progress.print(); + } + } + + assert((int) out.size() == n_steps); + assert(out.size() % n_coarse_codebooks == 0); + + // out_coarse: [seq_length, n_codes] + for (int i = 0; i < (int) out.size(); i += n_coarse_codebooks) { + // this assumes N_COARSE_CODEBOOKS = 2 + bark_sequence _tmp = { + out[i] - semantic_vocab_size, + out[i+1] - semantic_vocab_size - codebook_size + }; + out_coarse.push_back(_tmp); + } + + bctx->coarse_tokens = out_coarse; + + return true; +} + +static bool bark_eval_fine_encoder(struct bark_context * bctx, int n_threads) { + // input shape: [N, n_codes] + bark_codes input = bctx->coarse_tokens; + + std::vector logits; + logits.resize(1024*1056); + + auto & model = bctx->model.fine_model; + auto & hparams = model.hparams; + auto & params = bctx->params; + + float temp = params.fine_temp; + + int32_t n_coarse_codebooks = params.n_coarse_codebooks; + int32_t n_fine_codebooks = params.n_fine_codebooks; + int32_t codebook_size = params.codebook_size; + + int n_coarse = input[0].size(); + int original_seq_len = input.size(); + int n_remove_from_end = 0; + + // channel padding + for (int i = 0; i < (int) input.size(); i++) { + for (int j = n_coarse_codebooks; j < n_fine_codebooks; j++) { + input[i].push_back(codebook_size); + } + } + + // spatial padding if sequence is too short + if (original_seq_len < 1024) { + n_remove_from_end = 1024 - original_seq_len; + for (int i = original_seq_len; i < 1024; i++) { + bark_sequence _tmp(n_fine_codebooks, codebook_size); + input.push_back(_tmp); + } + } + + int n_loops = std::max(0, (int) ceilf((input.size() - 1024) / 512.f)) + 1; + + bark_codes in_arr = input; // [seq_length, n_codes] + + BarkProgressBar progress(std::string("Generating fine tokens"), n_loops * (n_fine_codebooks - n_coarse)); + + for (int n = 0; n < n_loops; n++) { + int start_idx = std::min(n * 512, (int) in_arr.size() - 1024); + int start_fill_idx = std::min(n * 512, (int) in_arr.size() - 512); + int rel_start_fill_idx = start_fill_idx - start_idx; + + // in_buffer: [n_codes*seq_length] (sequences are contiguous) + bark_sequence in_buffer; + for (int i = 0; i < n_fine_codebooks; i++) { + for (int j = start_idx; j < start_idx + 1024; j++) { + in_buffer.push_back(in_arr[j][i]); + } + } + + for (int nn = n_coarse; nn < n_fine_codebooks; nn++) { + if (!bark_eval_fine_encoder_internal(bctx, in_buffer, logits, nn, n_threads)) { + fprintf(stderr, "%s: Could not generate token\n", __func__); + return false; + } + + for (int i = 0; i < 1024; i++) { + std::vector relevant_logits( + logits.begin() + i * 1056, + logits.begin() + (i + 1) * 1056 + ); + relevant_logits.resize(codebook_size); + + bark_token next = gpt_sample( + relevant_logits, bctx->rng, temp, NULL, &model.t_sample_us, + &model.n_sample); + + in_buffer[nn * 1024 + rel_start_fill_idx + i] = next; + } + + progress.update(1); + progress.print(); + } + + // transfer over info into model_in + for (int nn = n_coarse; nn < 
n_fine_codebooks; nn++) { + for (int j = 0; j < codebook_size - rel_start_fill_idx; j++) { + in_arr[start_fill_idx+j][nn] = in_buffer[nn * 1024 + rel_start_fill_idx + j]; + } + } + } + + if (n_remove_from_end > 0) { + in_arr.resize(in_arr.size() - n_remove_from_end); + } + + assert(bctx->coarse_tokens.size() == in_arr.size()); + + bctx->fine_tokens = in_arr; + + return true; +} + +bool bark_forward_text_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity) { + const int64_t t_main_start_us = ggml_time_us(); + + auto & model = bctx->model.text_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + // allocate the compute buffer + { + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + bctx->allocr = ggml_allocr_new_measure(align); + + // create the worst-case graph for memory usage estimation + int n_past = 0; + std::vector decoy_tokens(256+256+1, 0); + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, decoy_tokens, &n_past, true /* merge_ctx */, n_threads); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf); + + // recreate the allocator with the required memory + ggml_allocr_free(bctx->allocr); + bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0); + } + } + + if (!bark_eval_text_encoder(bctx, n_threads)) { + fprintf(stderr, "%s: failed to forward text encoder\n", __func__); + return false; + } + + model.t_main_us = ggml_time_us() - t_main_start_us; + + bark_print_statistics(&model); + + ggml_backend_buffer_free(bctx->buf_compute); + ggml_allocr_free(bctx->allocr); + + return true; +} + +bool bark_forward_coarse_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity) { + const int64_t t_main_start_us = ggml_time_us(); + + auto & model = bctx->model.coarse_model; + auto & allocr = bctx->allocr; + auto & hparams = model.hparams; + + // allocate the compute buffer + { + // alignment required by the backend + size_t align = ggml_backend_get_alignment(model.backend); + bctx->allocr = ggml_allocr_new_measure(align); + + // create the worst-case graph for memory usage estimation + int n_past = 0; + std::vector decoy_tokens(hparams.block_size, 0); + struct ggml_cgraph * gf = bark_build_gpt_graph( + &model, allocr, decoy_tokens, &n_past, false /* merge_ctx */, n_threads); + + // compute the required memory + size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf); + + // recreate the allocator with the required memory + ggml_allocr_free(bctx->allocr); + bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); + bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute); + + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0); + } + } + + if (!bark_eval_coarse_encoder(bctx, n_threads)) { + fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__); + return false; + } + + model.t_main_us = ggml_time_us() - t_main_start_us; + + bark_print_statistics(&model); + + ggml_backend_buffer_free(bctx->buf_compute); + ggml_allocr_free(bctx->allocr); + + return true; +} + +bool 
bark_forward_fine_encoder(
+        struct bark_context * bctx,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    const int64_t t_main_start_us = ggml_time_us();
+
+    auto & model = bctx->model.fine_model;
+    auto & allocr = bctx->allocr;
+    auto & hparams = model.hparams;
+    auto & params = bctx->params;
+
+    int32_t n_fine_codebooks = params.n_fine_codebooks;
+
+    // allocate the compute buffer
+    {
+        // alignment required by the backend
+        size_t align = ggml_backend_get_alignment(model.backend);
+        bctx->allocr = ggml_allocr_new_measure(align);
+
+        // create the worst-case graph for memory usage estimation
+        std::vector<bark_token> decoy_tokens(hparams.block_size*n_fine_codebooks, 0);
+        struct ggml_cgraph * gf = bark_build_fine_gpt_graph(
+            &model, allocr, decoy_tokens, 2 /* codebook_idx */, n_fine_codebooks, n_threads);
+
+        // compute the required memory
+        size_t mem_size = ggml_allocr_alloc_graph(bctx->allocr, gf);
+
+        // recreate the allocator with the required memory
+        ggml_allocr_free(bctx->allocr);
+        bctx->buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
+        bctx->allocr = ggml_allocr_new_from_buffer(bctx->buf_compute);
+
+        if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) {
+            fprintf(stderr, "%s: compute buffer size: %.2f MB\n\n", __func__, mem_size/1024.0/1024.0);
+        }
+    }
+
+    if (!bark_eval_fine_encoder(bctx, n_threads)) {
+        fprintf(stderr, "%s: failed to forward fine encoder\n", __func__);
+        return false;
+    }
+
+    model.t_main_us = ggml_time_us() - t_main_start_us;
+
+    bark_print_statistics(&model);
+
+    ggml_backend_buffer_free(bctx->buf_compute);
+    ggml_allocr_free(bctx->allocr);
+
+    return true;
+}
+
+static bool bark_forward_eval(
+        struct bark_context * bctx,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    if (!bark_forward_text_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward text encoder\n", __func__);
+        return false;
+    }
+
+    if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__);
+        return false;
+    }
+
+    if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward fine encoder\n", __func__);
+        return false;
+    }
+
+    return true;
+}
+
+bool bark_generate_audio(
+        struct bark_context * bctx,
+        std::string & text,
+        std::string & dest_wav_path,
+        int n_threads,
+        bark_verbosity_level verbosity) {
+    if (!bctx) {
+        fprintf(stderr, "%s: invalid bark context\n", __func__);
+        return false;
+    }
+
+    int64_t t_start_eval_us = ggml_time_us();
+
+    bark_tokenize_input(bctx, text);
+
+    if (!bark_forward_eval(bctx, n_threads, verbosity)) {
+        fprintf(stderr, "%s: failed to forward eval\n", __func__);
+        return false;
+    }
+
+    // call the Encodec API to generate the audio waveform from the tokens
+    const int n_gpu_layers = bctx->n_gpu_layers;
+    const std::string encodec_model_path = bctx->encodec_model_path;
+
+    struct encodec_context * ectx = encodec_load_model(
+        encodec_model_path, n_gpu_layers, encodec_verbosity_level::LOW);
+    if (!ectx) {
+        printf("%s: error loading encodec model\n", __func__);
+        return false;
+    }
+
+    auto & params = bctx->params;
+
+    int32_t target_bandwidth = params.target_bandwidth;
+    int32_t sample_rate = params.sample_rate;
+
+    encodec_set_target_bandwidth(ectx, target_bandwidth);
+    encodec_set_sample_rate(ectx, sample_rate);
+
+    // current shape fine_tokens: [seq_length][n_channels], n_channels are contiguous
+    // encodec expects shape fine_tokens: [n_channels][seq_length],
time steps are contiguous + std::vector encodec_tokens; + + // copy fine_tokens into encodec_tokens by transposing to abide by encodec's shape + for (int i = 0; i < (int) bctx->fine_tokens[0].size(); i++) { + for (int j = 0; j < (int) bctx->fine_tokens.size(); j++) { + encodec_tokens.push_back(bctx->fine_tokens[j][i]); + } + } + + if (!encodec_decompress_audio(ectx, encodec_tokens, n_threads)) { + printf("%s: Could not generate waveform from tokens with Encodec\n", __func__); + return false; + } + + bctx->audio_arr = ectx->out_audio; + + encodec_free(ectx); + + bctx->t_eval_us = ggml_time_us() - t_start_eval_us; + + return true; +} + +static void bark_free_model(struct gpt_model * model) { + if (!model) { + return; + } + + if(model->ctx) { + ggml_free(model->ctx); + } + + ggml_backend_buffer_free(model->buffer_w); + ggml_backend_free(model->backend); +} + +void bark_free(struct bark_context * bctx) { + if (!bctx) { + return; + } + + bark_free_model(&bctx->model.text_model); + bark_free_model(&bctx->model.coarse_model); + bark_free_model(&bctx->model.fine_model); + + delete bctx; +} + +static struct bark_model * bark_load_model_from_file( + const std::string & dirname, + struct bark_model * model, + int n_gpu_layers, + bark_verbosity_level verbosity) { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: loading model from '%s'\n", __func__, dirname.c_str()); + } + + // text + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: reading bark text model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_text.bin"; + if (!gpt_load_model_weights(fname, model->text_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // vocab + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("%s: reading bark vocab\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_vocab.bin"; + const gpt_hparams hparams = model->text_model.hparams; + const int32_t expected_size = hparams.n_in_vocab - hparams.n_out_vocab - 5; + + if (!bark_vocab_load(fname, &model->vocab, expected_size)) { + fprintf(stderr, "%s: invalid model file '%s' (bad text)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // coarse + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("\n%s: reading bark coarse model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_coarse.bin"; + + if (!gpt_load_model_weights(fname, model->coarse_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad coarse)\n", __func__, fname.c_str()); + return nullptr; + } + } + + // fine + { + if (verbosity == bark_verbosity_level::MEDIUM || verbosity == bark_verbosity_level::HIGH) { + printf("\n%s: reading bark fine model\n", __func__); + } + + const std::string fname = std::string(dirname) + "/ggml_weights_fine.bin"; + + if (!gpt_load_model_weights(fname, model->fine_model, n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: invalid model file '%s' (bad fine)\n", __func__, fname.c_str()); + return nullptr; + } + } + + printf("\n"); + + return model; +} + +struct bark_context_params bark_context_default_params() { + struct bark_context_params result = { + /*.seed =*/ 0, + /*.temp =*/ 0.7, + /*.fine_temp =*/ 0.5, + /*.min_eos_p 
=*/ 0.2, + /*.sliding_window_size =*/ 60, + /*.max_coarse_history =*/ 630, + /*.sample_rate =*/ 24000, + /*.target_bandwidth =*/ 12, + /*.cls_token_id =*/ 101, + /*.sep_token_id =*/ 102, + /*.n_steps_text_encoder =*/ 768, + /*.text_pad_token =*/ 129595, + /*.text_encoding_offset =*/ 10048, + /*.semantic_rate_hz =*/ 49.9f, + /*.semantic_pad_token =*/ 10000, + /*.semantic_vocab_size =*/ 10000, + /*.semantic_infer_token =*/ 129599, + /*.coarse_rate_hz =*/ 75.0f, + /*.coarse_infer_token =*/ 12050, + /*.coarse_semantic_pad_token =*/ 12048, + /*.n_coarse_codebooks =*/ 2, + /*.n_fine_codebooks =*/ 8, + /*.codebook_size =*/ 1024, + }; + + return result; +} + +struct bark_context * bark_load_model( + const std::string & model_path, + bark_verbosity_level verbosity) { + int64_t t_load_start_us = ggml_time_us(); + + struct bark_context * bctx = new bark_context(); + + bctx->model = bark_model(); + if (!bark_load_model_from_file(model_path, &bctx->model, bctx->n_gpu_layers, verbosity)) { + fprintf(stderr, "%s: failed to load model weights from '%s'\n", __func__, model_path.c_str()); + return {}; + } + + bark_context_params params = bark_context_default_params(); + bctx->rng = std::mt19937(params.seed); + + bctx->params = params; + + bctx->t_load_us = ggml_time_us() - t_load_start_us; + + return bctx; +} + +bool bark_model_quantize( + const std::string & fname_inp, + const std::string & fname_out, + ggml_ftype ftype) { + printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); + + gpt_model model; + + auto fin = std::ifstream(fname_inp, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str()); + return false; + } + + auto fout = std::ofstream(fname_out, std::ios::binary); + if (!fout) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str()); + return false; + } + + fout.write((char *) &magic, sizeof(magic)); + } + + gpt_hparams hparams; + + // load hparams + { + auto & hparams = model.hparams; + + read_safe(fin, hparams.n_layer); + read_safe(fin, hparams.n_head); + read_safe(fin, hparams.n_embd); + read_safe(fin, hparams.block_size); + read_safe(fin, hparams.bias); + read_safe(fin, hparams.n_in_vocab); + read_safe(fin, hparams.n_out_vocab); + read_safe(fin, hparams.n_lm_heads); + read_safe(fin, hparams.n_wtes); + read_safe(fin, hparams.ftype); + + const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR; + int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; + + printf("%s: n_in_vocab = %d\n", __func__, hparams.n_in_vocab); + printf("%s: n_out_vocab = %d\n", __func__, hparams.n_out_vocab); + printf("%s: block_size = %d\n", __func__, hparams.block_size); + printf("%s: bias = %d\n", __func__, hparams.bias); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_lm_heads = %d\n", __func__, hparams.n_lm_heads); + printf("%s: n_wtes = %d\n", __func__, hparams.n_wtes); + printf("%s: ftype (src) = %d\n", __func__, hparams.ftype); + printf("%s: qntvr (src) = %d\n", __func__, qntvr_src); + printf("%s: ftype (dst) = %d\n", __func__, ftype_dst); + printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION); + + 
write_safe(fout, hparams.n_layer); + write_safe(fout, hparams.n_head); + write_safe(fout, hparams.n_embd); + write_safe(fout, hparams.block_size); + write_safe(fout, hparams.bias); + write_safe(fout, hparams.n_in_vocab); + write_safe(fout, hparams.n_out_vocab); + write_safe(fout, hparams.n_lm_heads); + write_safe(fout, hparams.n_wtes); + write_safe(fout, ftype_dst); + } + + // regexes of tensor names to be quantized + const std::vector to_quant = { + "model/wte/.*", + "model/lm_head/.*", + "model/h.*/attn/c_attn/w", + "model/h.*/attn/c_proj/w", + "model/h.*/mlp/c_fc/w", + "model/h.*/mlp/c_proj/w", + }; + + if (!ggml_quantize_weights(fin, fout, ftype, to_quant, {})) { + fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str()); + return false; + } + + fin.close(); + fout.close(); + + return true; +} diff --git a/bark/bark.h b/bark/bark.h new file mode 100644 index 0000000..495a2c9 --- /dev/null +++ b/bark/bark.h @@ -0,0 +1,347 @@ +#include "ggml.h" +#include "ggml-backend.h" + +#include +#include +#include +#include + +#ifdef BARK_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef BARK_BUILD +# define BARK_API __declspec(dllexport) +# else +# define BARK_API __declspec(dllimport) +# endif +# else +# define BARK_API __attribute__ ((visibility ("default"))) +# endif +#else +# define BARK_API +#endif + +enum class bark_verbosity_level { + LOW = 0, + MEDIUM = 1, + HIGH = 2, +}; + +typedef int32_t bark_token; + +typedef std::vector bark_sequence; +typedef std::vector> bark_codes; + +struct gpt_hparams { + int32_t n_in_vocab; + int32_t n_out_vocab; + int32_t n_layer; + int32_t n_head; + int32_t n_embd; + int32_t block_size; + int32_t n_lm_heads; + int32_t n_wtes; + int32_t ftype; + int32_t bias; + + int32_t n_codes_given = 1; +}; + +struct bark_vocab { + using id = int32_t; + using token = std::string; + + std::map token_to_id; + std::map id_to_token; +}; + +struct gpt_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt_model { + gpt_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wpe; // position embedding + std::vector wtes; // token embedding + std::vector lm_heads; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + struct ggml_context * ctx; + + ggml_backend_t backend = NULL; + + ggml_backend_buffer_t buffer_w; + ggml_backend_buffer_t buffer_kv; + + std::map tensors; + + // + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + int64_t t_main_us = 0; + + // + int64_t n_sample = 0; + + // + int64_t memsize = 0; +}; + +struct bark_model { + // encoder + gpt_model coarse_model; + gpt_model fine_model; + gpt_model text_model; + + // vocab + bark_vocab vocab; +}; + +struct bark_context_params { + // RNG seed + uint32_t seed; + + // Temperature for sampling (text and coarse encoders) + float temp; + // Temperature for sampling (fine encoder) + float fine_temp; + + // Minimum probability for EOS token (text encoder) + float min_eos_p; + // Sliding window 
size for coarse encoder
+    int32_t sliding_window_size;
+    // Max history for coarse encoder
+    int32_t max_coarse_history;
+
+    // Sample rate
+    int32_t sample_rate;
+    // Target bandwidth
+    int32_t target_bandwidth;
+
+    // CLS token ID
+    int32_t cls_token_id;
+    // SEP token ID
+    int32_t sep_token_id;
+
+    // Maximum number of semantic tokens to generate
+    int32_t n_steps_text_encoder;
+
+    // Text PAD token ID
+    int32_t text_pad_token;
+    // Text encoding offset
+    int32_t text_encoding_offset;
+
+    // Semantic frequency rate
+    float semantic_rate_hz;
+    // Semantic PAD token ID
+    int32_t semantic_pad_token;
+    // Vocabulary size in semantic encoder
+    int32_t semantic_vocab_size;
+    // Semantic inference token ID
+    int32_t semantic_infer_token;
+
+    // Coarse frequency rate
+    float coarse_rate_hz;
+    // Coarse infer token ID
+    int32_t coarse_infer_token;
+    // Coarse semantic pad token ID
+    int32_t coarse_semantic_pad_token;
+
+    // Number of codebooks in coarse encoder
+    int32_t n_coarse_codebooks;
+    // Number of codebooks in fine encoder
+    int32_t n_fine_codebooks;
+    // Size of each codebook
+    int32_t codebook_size;
+};
+
+struct bark_context {
+    bark_model model;
+
+    // buffer for model evaluation
+    ggml_backend_buffer_t buf_compute;
+
+    // custom allocator
+    struct ggml_allocr * allocr = NULL;
+    int n_gpu_layers = 0;
+
+    std::mt19937 rng;
+
+    bark_sequence tokens;
+    bark_sequence semantic_tokens;
+
+    bark_codes coarse_tokens;
+    bark_codes fine_tokens;
+
+    std::vector<float> audio_arr;
+
+    // hyperparameters
+    bark_context_params params;
+
+    // statistics
+    int64_t t_load_us = 0;
+    int64_t t_eval_us = 0;
+
+    // encodec parameters
+    std::string encodec_model_path;
+};
+
+/**
+ * @brief Returns the default parameters for a bark context.
+ *
+ * @return bark_context_params The default parameters for a bark context.
+ */
+BARK_API struct bark_context_params bark_context_default_params(void);
+
+/**
+ * Loads a Bark model from the specified directory.
+ *
+ * @param model_path The directory path of the bark model to load.
+ * @param verbosity The verbosity level when loading the model.
+ * @return A pointer to the loaded bark model context.
+ */
+BARK_API struct bark_context * bark_load_model(
+    const std::string & model_path,
+    bark_verbosity_level verbosity);
+
+/**
+ * Generates an audio file from the given text using the specified Bark context.
+ *
+ * @param bctx The Bark context to use for generating the audio.
+ * @param text The text to generate audio from.
+ * @param dest_wav_path The path to save the generated audio file.
+ * @param n_threads The number of threads to use for generating the audio.
+ * @param verbosity The verbosity level when generating the audio.
+ * @return True if the audio was successfully generated, false otherwise.
+ */
+BARK_API bool bark_generate_audio(
+    bark_context * bctx,
+    std::string & text,
+    std::string & dest_wav_path,
+    int n_threads,
+    bark_verbosity_level verbosity);
+
+/**
+ * Quantizes a bark model and saves the result to a file.
+ *
+ * @param fname_inp The name of the input file containing the BARK model.
+ * @param fname_out The name of the output file to save the quantized model to.
+ * @param ftype The target ggml quantization type.
+ * @return True if the model was successfully quantized and saved, false otherwise.
+ */
+BARK_API bool bark_model_quantize(
+    const std::string & fname_inp,
+    const std::string & fname_out,
+    ggml_ftype ftype);
+
+/**
+ * @brief Frees the memory allocated for a bark context.
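+ * Releases the text, coarse and fine GPT models owned by the context before deleting it.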
+ * + * @param bctx The bark context to free. + */ +BARK_API void bark_free( + struct bark_context * bctx); + +/** + * Loads a vocabulary from a file. + * + * @param fname The name of the file to load the vocabulary from. + * @param vocab A pointer to the bark_vocab struct to store the loaded vocabulary in. + * @param expected_size The expected size of the vocabulary. + * @return true if the vocabulary was loaded successfully, false otherwise. + */ +bool bark_vocab_load( + const std::string & fname, + bark_vocab * vocab, + int32_t expected_size); + +/** + * Tokenizes the input text using the provided vocabulary. + * + * @param vocab Pointer to the vocabulary to use for tokenization. + * @param text The input text to tokenize. + * @param tokens Pointer to an array where the resulting tokens will be stored. + * @param n_tokens Pointer to an integer where the number of resulting tokens will be stored. + * @param n_max_tokens The maximum number of tokens that can be stored in the tokens array. + */ +void bert_tokenize( + const bark_vocab * vocab, + const char * text, + int32_t * tokens, + int32_t * n_tokens, + int32_t n_max_tokens); + +/** + * Encodes the input text using the forward algorithm. + * + * @param bctx A pointer to the bark context struct. + * @param n_threads The number of threads to use for encoding. + * @param verbosity The verbosity level when encoding. + * @return Returns true if the encoding was successful, false otherwise. + */ +bool bark_forward_text_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); + +/** + * \brief Encodes the input data using the coarse encoder in the bark library. + * + * This function encodes the input data using the coarse encoder in the bark library. + * It takes a bark_context structure pointer, the number of threads to use, and the verbosity level as parameters. + * + * \param bctx The bark_context structure pointer. + * \param n_threads The number of threads to use for encoding. + * \param verbosity The verbosity level for logging. + * \return Returns true if the encoding is successful, false otherwise. + */ +bool bark_forward_coarse_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); + +/** + * @brief Performs forward fine encoding using the specified bark context. + * + * This function encodes the input data using the bark context provided. It performs + * the encoding operation in parallel using the specified number of threads. The + * verbosity level can be used to control the amount of logging information printed + * during the encoding process. + * + * @param bctx The bark context used for encoding. + * @param n_threads The number of threads to use for parallel encoding. + * @param verbosity The verbosity level for logging information. + * @return True if the encoding operation was successful, false otherwise. + */ +bool bark_forward_fine_encoder( + struct bark_context * bctx, + int n_threads, + bark_verbosity_level verbosity); diff --git a/convert.py b/bark/convert.py similarity index 68% rename from convert.py rename to bark/convert.py index 6162aae..fd3f613 100644 --- a/convert.py +++ b/bark/convert.py @@ -11,21 +11,11 @@ - Name (char[name_length]) - Data (float[n_dims]) -Note ----- -Encodec uses weight normalization for its convolutional layers. All the weights are -decomposed into two tensors called with the suffixes _weight_v and _weight_g. 
A simple -call to the hook torch._weight_norm allows to get the final weight tensor of the -convolution from weight_v and weight_g. To drastically reduce the number of operations -at inference time, the ggml weights file only contain the final convolution weights but -does not store the decomposition into weight_v and weight_g. - Example ------- ```bash python convert.py \ --dir-model ~/.cache/suno/bark_v0 \ - --codec-path ~/Documents/encodec.cpp/ggml_weights \ --vocab-path ./ggml_weights/ \ --out-dir ./ggml_weights/ \ --use-f16 @@ -42,77 +32,21 @@ parser = argparse.ArgumentParser() parser.add_argument("--dir-model", type=str, required=True) -parser.add_argument("--codec-path", type=str, required=True) parser.add_argument("--vocab-path", type=str, required=True) parser.add_argument("--out-dir", type=str, required=True) parser.add_argument("--use-f16", action="store_true") -def parse_codec_model(checkpoint, out_dir): - """Load encodec model checkpoint.""" - outfile = open(out_dir, "wb") - outfile.write(struct.pack("i", 0x67676d6c)) # ggml magic - - for name in checkpoint.keys(): - if "encoder." in name: - # bark only uses Encodec's quantizer and decoder. - continue - - if "weight_g" in name: - # the tensor has already been parsed with the corresponding "weight_v" - # tensor to form the final weights tensor of the convolution, therefore - # we skip it - continue - - if "inited" in name or "cluster_size" in name or "embed_avg" in name: - # "inited", "cluster_size" and "embed_avg" tensors in quantizer are not used - # for the forward pass - continue - - var_data = checkpoint[name] - - if not "weight_v" in name: - # if conv kernel, do not squeeze because 3d tensor - var_data = var_data.numpy().squeeze() - else: - # weight_v has its corresponding magnitude tensor to rescale the weights - # of the convolutional layers. We parse both kinds of weights jointly to - # build the final weight tensor of the convolution. 
- base_name = name.split(".")[:-1] - weight_g_name = ".".join(base_name + ["weight_g"]) - var_data_g = checkpoint[weight_g_name] - - final_var_data = torch._weight_norm(var_data, var_data_g, dim=0) - var_data = final_var_data.numpy() - - name = ".".join(base_name + ["weight"]) - - print(f"Processing variable: {name} with shape: {var_data.shape}") - - if var_data.dtype != np.float32: - print(" Converting to float32") - var_data = var_data.astype(np.float32) - - n_dims = len(var_data.shape) - encoded_name = name.encode("utf-8") - ftype = 0 # float32 - outfile.write(struct.pack("iii", n_dims, len(encoded_name), ftype)) - - for i in range(n_dims): - outfile.write(struct.pack("i", var_data.shape[n_dims - 1 - i])) - outfile.write(encoded_name) - - var_data.tofile(outfile) - - outfile.close() - -def parse_hparams(hparams, outfile, use_f16): +def parse_hparams(hparams, outfile, use_f16, overwrite_bias): """Parse GPT hyperparameters.""" outfile.write(struct.pack("i", hparams["n_layer"])) outfile.write(struct.pack("i", hparams["n_head"])) outfile.write(struct.pack("i", hparams["n_embd"])) outfile.write(struct.pack("i", hparams["block_size"])) + bias = 1 if overwrite_bias else hparams["bias"] + outfile.write(struct.pack("i", int(bias))) + try: outfile.write(struct.pack("ii", hparams["vocab_size"], hparams["vocab_size"])) except KeyError: @@ -127,7 +61,7 @@ def parse_hparams(hparams, outfile, use_f16): n_wtes = hparams["n_codes_total"] except KeyError: n_lm_heads, n_wtes = 1, 1 - + ftype = int(use_f16) outfile.write(struct.pack("iii", n_lm_heads, n_wtes, ftype)) @@ -140,12 +74,6 @@ def parse_text_models(checkpoint, outfile, use_f16): n_dims = len(var_data.shape) - # ftype_cur = 0 - # if var_data.dtype != np.float32: - # print(" Converting to float32") - # var_data = var_data.astype(np.float32) - # ftype_cur = 0 - # strip `_orig_mod.transformer.` prefix if name == "_orig_mod.lm_head.weight": name = "lm_head.weight" @@ -233,12 +161,12 @@ def parse_text_models(checkpoint, outfile, use_f16): var_data.tofile(outfile) -def generate_file(in_file, out_dir, use_f16): +def generate_file(in_file, out_dir, use_f16, overwrite_bias=False): with open(out_dir, "wb") as fout: fout.write(struct.pack("i", 0x67676d6c)) # ggml magic checkpoint = torch.load(in_file, map_location="cpu") - parse_hparams(checkpoint["model_args"], fout, use_f16) + parse_hparams(checkpoint["model_args"], fout, use_f16, overwrite_bias) parse_text_models(checkpoint["model"], fout, use_f16) def generate_vocab_file(dir_model, out_dir): @@ -262,7 +190,6 @@ def generate_vocab_file(dir_model, out_dir): args = parser.parse_args() dir_model = Path(args.dir_model) - codec_path = Path(args.codec_path) vocab_path = Path(args.vocab_path) out_dir = Path(args.out_dir) @@ -277,11 +204,9 @@ def generate_vocab_file(dir_model, out_dir): generate_file(dir_model / "coarse_2.pt", out_dir / "ggml_weights_coarse.bin", args.use_f16) print(" Coarse model loaded.") - generate_file(dir_model / "fine_2.pt", out_dir / "ggml_weights_fine.bin", args.use_f16) + # overwrite_bias set to True since the fine model has biases and current config file + # has bias set to False + generate_file(dir_model / "fine_2.pt", out_dir / "ggml_weights_fine.bin", args.use_f16, overwrite_bias=True) print(" Fine model loaded.") - codec_chkpt = torch.load(codec_path / "encodec_24khz-d7cc33bc.th", map_location="cpu") - parse_codec_model(codec_chkpt, out_dir / "ggml_weights_codec.bin") - print(" Codec model loaded.") - print("Done.") diff --git a/download_weights.py b/bark/download_weights.py 
similarity index 100% rename from download_weights.py rename to bark/download_weights.py diff --git a/bark/examples/CMakeLists.txt b/bark/examples/CMakeLists.txt new file mode 100644 index 0000000..4a09e81 --- /dev/null +++ b/bark/examples/CMakeLists.txt @@ -0,0 +1,7 @@ +add_library(common STATIC common.cpp) +target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_compile_features(common PRIVATE cxx_std_11) + +add_subdirectory(main) +add_subdirectory(server) +add_subdirectory(quantize) \ No newline at end of file diff --git a/bark/examples/common.cpp b/bark/examples/common.cpp new file mode 100644 index 0000000..b8cad4f --- /dev/null +++ b/bark/examples/common.cpp @@ -0,0 +1,69 @@ +#include +#include +#include + +#define DR_WAV_IMPLEMENTATION +#include "dr_wav.h" + +#include "common.h" + +#define SAMPLE_RATE 24000 + +void write_wav_on_disk(std::vector & audio_arr, std::string dest_path) { + drwav_data_format format; + format.bitsPerSample = 32; + format.sampleRate = SAMPLE_RATE; + format.container = drwav_container_riff; + format.channels = 1; + format.format = DR_WAVE_FORMAT_IEEE_FLOAT; + + drwav wav; + drwav_init_file_write(&wav, dest_path.c_str(), &format, NULL); + drwav_uint64 frames = drwav_write_pcm_frames(&wav, audio_arr.size(), audio_arr.data()); + drwav_uninit(&wav); + + fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames); +} + +void bark_print_usage(char ** argv, const bark_params & params) { + std::cout << "usage: " << argv[0] << " [options]\n" + << "\n" + << "options:\n" + << " -h, --help show this help message and exit\n" + << " -t N, --threads N number of threads to use during computation (default: " << params.n_threads << ")\n" + << " -s N, --seed N seed for random number generator (default: " << params.seed << ")\n" + << " -p PROMPT, --prompt PROMPT\n" + << " prompt to start generation with (default: random)\n" + << " -m FNAME, --model FNAME\n" + << " model path (default: " << params.model_path << ")\n" + << " -o FNAME, --outwav FNAME\n" + << " output generated wav (default: " << params.dest_wav_path << ")\n" + << "\n"; +} + +int bark_params_parse(int argc, char ** argv, bark_params & params) { + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; + + if (arg == "-t" || arg == "--threads") { + params.n_threads = std::stoi(argv[++i]); + } else if (arg == "-p" || arg == "--prompt") { + params.prompt = argv[++i]; + } else if (arg == "-m" || arg == "--model") { + params.model_path = argv[++i]; + } else if (arg == "-s" || arg == "--seed") { + params.seed = std::stoi(argv[++i]); + } else if (arg == "-o" || arg == "--outwav") { + params.dest_wav_path = argv[++i]; + } else if (arg == "-h" || arg == "--help") { + bark_print_usage(argv, params); + exit(0); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + bark_print_usage(argv, params); + exit(0); + } + } + + return 0; +} diff --git a/bark/examples/common.h b/bark/examples/common.h new file mode 100644 index 0000000..e347e55 --- /dev/null +++ b/bark/examples/common.h @@ -0,0 +1,48 @@ +#include +#include +#include + +struct bark_params { + // Number of threads used for audio generation. + int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); + + // User prompt. + std::string prompt = "This is an audio generated by bark.cpp"; + + // Location of model weights. + std::string model_path = "./ggml_weights"; + + // Destination path for generated WAV file. 
+    std::string dest_wav_path = "output.wav";
+
+    // Seed for reproducibility in token sampling.
+    int32_t seed = 0;
+};
+
+/**
+ * @brief Writes the generated audio data to a WAV file on disk.
+ *
+ * @param audio_arr Vector of float samples containing the audio data to write.
+ * @param dest_path Path of the WAV file to write.
+ */
+void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path);
+
+/**
+ * @brief Parses command line arguments and stores them in a bark_params struct.
+ *
+ * @param argc The number of command line arguments.
+ * @param argv An array of C-strings containing the command line arguments.
+ * @param params A reference to a bark_params struct where the parsed arguments will be stored.
+ * @return int Returns 0 if the parsing was successful, otherwise returns a non-zero value.
+ */
+int bark_params_parse(int argc, char ** argv, bark_params & params);
+
+/**
+ * Prints the usage information for the bark command-line tool.
+ *
+ * @param argv The command-line arguments passed to the program.
+ * @param params The parameters used by the bark command-line tool.
+ */
+void bark_print_usage(char ** argv, const bark_params & params);
diff --git a/dr_wav.h b/bark/examples/dr_wav.h
similarity index 100%
rename from dr_wav.h
rename to bark/examples/dr_wav.h
diff --git a/examples/main/CMakeLists.txt b/bark/examples/main/CMakeLists.txt
similarity index 64%
rename from examples/main/CMakeLists.txt
rename to bark/examples/main/CMakeLists.txt
index fd8855f..3d35bf7 100644
--- a/examples/main/CMakeLists.txt
+++ b/bark/examples/main/CMakeLists.txt
@@ -1,9 +1,6 @@
 set(TARGET main)
-
 add_executable(${TARGET} main.cpp)
-
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE bark common)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 
 if(MSVC)
diff --git a/bark/examples/main/main.cpp b/bark/examples/main/main.cpp
new file mode 100644
index 0000000..6357bf0
--- /dev/null
+++ b/bark/examples/main/main.cpp
@@ -0,0 +1,60 @@
+#include
+#include
+
+#include "ggml.h"
+#include "bark.h"
+#include "common.h"
+
+
+int main(int argc, char **argv) {
+    ggml_time_init();
+    const int64_t t_main_start_us = ggml_time_us();
+
+    bark_params params;
+    bark_verbosity_level verbosity = bark_verbosity_level::LOW;
+
+    if (bark_params_parse(argc, argv, params) > 0) {
+        fprintf(stderr, "%s: Could not parse arguments\n", __func__);
+        return 1;
+    }
+
+    std::cout << R"( __ __ )" << "\n"
+              << R"( / /_ ____ ______/ /__ _________ ____ )" << "\n"
+              << R"( / __ \/ __ `/ ___/ //_/ / ___/ __ \/ __ \)" << "\n"
+              << R"( / /_/ / /_/ / / / ,< _ / /__/ /_/ / /_/ /)" << "\n"
+              << R"(/_.___/\__,_/_/ /_/|_| (_) \___/ .___/ .___/ )" << "\n"
+              << R"( /_/ /_/ )" << "\n";
+
+    // initialize bark context
+    struct bark_context * bctx = bark_load_model(params.model_path, verbosity);
+    if (!bctx) {
+        fprintf(stderr, "%s: Could not load model\n", __func__);
+        exit(1);
+    }
+
+    // TODO: for now, hardcoding the Encodec model path
+    bctx->encodec_model_path = "/Users/pbannier/Documents/encodec.cpp/ggml_weights/ggml-model.bin";
+
+    // generate audio
+    if (!bark_generate_audio(bctx, params.prompt, params.dest_wav_path, params.n_threads, verbosity)) {
+        fprintf(stderr, "%s: An error occurred.
If the problem persists, feel free to open an issue to report it.\n", __func__); + exit(1); + } + + auto & audio_arr = bctx->audio_arr; + write_wav_on_disk(audio_arr, params.dest_wav_path); + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, bctx->t_load_us/1000.0f); + printf("%s: eval time = %8.2f ms\n", __func__, bctx->t_eval_us/1000.0f); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + bark_free(bctx); + + return 0; +} diff --git a/bark/examples/quantize/CMakeLists.txt b/bark/examples/quantize/CMakeLists.txt new file mode 100644 index 0000000..c406ab1 --- /dev/null +++ b/bark/examples/quantize/CMakeLists.txt @@ -0,0 +1,4 @@ +set(TARGET quantize) +add_executable(${TARGET} main.cpp) +target_link_libraries(${TARGET} PRIVATE bark) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/quantize.cpp b/bark/examples/quantize/main.cpp similarity index 95% rename from examples/quantize/quantize.cpp rename to bark/examples/quantize/main.cpp index bbfb403..3139f90 100644 --- a/examples/quantize/quantize.cpp +++ b/bark/examples/quantize/main.cpp @@ -1,4 +1,4 @@ -/*This script quantizes the weights of the 3 GPT encoders. 5 quantization types are +/* This script quantizes the weights of the 3 GPT encoders. 5 quantization types are available: - q4_0 - q4_1 @@ -7,14 +7,15 @@ - q8_0 Usage: +```bash ./quantize \ ./ggml_weights/ggml_weights_text.bin \ ./ggml_weights_q4/ggml_weights_text_quant.bin \ - type + q4_0 +``` */ #include "ggml.h" #include "bark.h" -#include "bark-util.h" #include #include diff --git a/examples/server/CMakeLists.txt b/bark/examples/server/CMakeLists.txt similarity index 59% rename from examples/server/CMakeLists.txt rename to bark/examples/server/CMakeLists.txt index 71dba1b..b6a6f8b 100644 --- a/examples/server/CMakeLists.txt +++ b/bark/examples/server/CMakeLists.txt @@ -2,4 +2,4 @@ set(TARGET server) add_executable(${TARGET} server.cpp httplib.h json.hpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE bark ${CMAKE_THREAD_LIBS_INIT}) diff --git a/examples/server/httplib.h b/bark/examples/server/httplib.h similarity index 100% rename from examples/server/httplib.h rename to bark/examples/server/httplib.h diff --git a/examples/server/json.hpp b/bark/examples/server/json.hpp similarity index 100% rename from examples/server/json.hpp rename to bark/examples/server/json.hpp diff --git a/examples/server/server.cpp b/bark/examples/server/server.cpp similarity index 85% rename from examples/server/server.cpp rename to bark/examples/server/server.cpp index b724784..d9a1c45 100644 --- a/examples/server/server.cpp +++ b/bark/examples/server/server.cpp @@ -96,23 +96,13 @@ int main(int argc, char ** argv) { bark_params_parse(argc, argv, params); - // create model - bark_model * model = bark_load_model_from_file(params.model_path.c_str()); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model_path.c_str()); + struct bark_context * bctx = bark_load_model(params.model_path.c_str(), bark_verbosity_level::LOW); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); return 1; } - // create params - bark_context_params bctx_params = bark_context_default_params(); - bark_context * bctx = bark_new_context_with_model(model, bctx_params); - if (bctx == NULL) { 
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model_path.c_str()); - bark_free_model(model); - return 1; - } - - bark_seed_rng(bctx, params.seed); + // bark_seed_rng(bctx, params.seed); std::mutex bark_mutex; @@ -135,13 +125,13 @@ int main(int argc, char ** argv) { std::string text = jreq.at("text"); // generate audio - bark_generate_audio(bctx, text.c_str(), "/tmp/bark_tmp.wav", params.n_threads); + std::string dest_wav_path = "/tmp/bark_tmp.wav"; + bark_generate_audio(bctx, text, dest_wav_path, params.n_threads, bark_verbosity_level::LOW); // read audio as binary std::ifstream wav_file("/tmp/bark_tmp.wav", std::ios::binary); - if (wav_file.is_open()) - { + if (wav_file.is_open()) { // Read the contents of the WAV file std::string wav_contents((std::istreambuf_iterator(wav_file)), std::istreambuf_iterator()); @@ -152,8 +142,7 @@ int main(int argc, char ** argv) { // Set the response body to the WAV file contents res.set_content(wav_contents, "audio/wav"); } - else - { + else { // If the file cannot be opened, set a 500 Internal Server Error response res.status = 500; res.set_content("Internal Server Error", "text/plain"); @@ -169,8 +158,7 @@ int main(int argc, char ** argv) { svr.set_read_timeout(params.sparams.read_timeout); svr.set_write_timeout(params.sparams.write_timeout); - if (!svr.bind_to_port(params.sparams.hostname, params.sparams.port)) - { + if (!svr.bind_to_port(params.sparams.hostname, params.sparams.port)) { fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.sparams.hostname.c_str(), params.sparams.port); return 1; @@ -183,8 +171,7 @@ int main(int argc, char ** argv) { printf("\nbark server listening at http://%s:%d\n\n", params.sparams.hostname.c_str(), params.sparams.port); - if (!svr.listen_after_bind()) - { + if (!svr.listen_after_bind()) { return 1; } diff --git a/requirements.txt b/bark/requirements.txt similarity index 100% rename from requirements.txt rename to bark/requirements.txt diff --git a/bark/tests/CMakeLists.txt b/bark/tests/CMakeLists.txt new file mode 100644 index 0000000..43ca0f3 --- /dev/null +++ b/bark/tests/CMakeLists.txt @@ -0,0 +1,26 @@ +add_library(test_utils STATIC common.cpp) +target_include_directories(test_utils PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) +target_compile_features(test_utils PRIVATE cxx_std_11) + +# +# test-tokenizer + +set(TEST_TARGET test-tokenizer) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) + + +# +# test-coarse-encoder + +set(TEST_TARGET test-coarse-encoder) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) + + +# +# test-fine-encoder + +set(TEST_TARGET test-fine-encoder) +add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp) +target_link_libraries(${TEST_TARGET} PRIVATE bark) diff --git a/tests/common.cpp b/bark/tests/common.cpp similarity index 98% rename from tests/common.cpp rename to bark/tests/common.cpp index 164a471..7fe11ea 100644 --- a/tests/common.cpp +++ b/bark/tests/common.cpp @@ -1,12 +1,10 @@ +#include #include -#include #include +#include -#include "bark-util.h" #include "common.h" -#define BARK_API_INTERNAL - int64_t bytes_left(std::ifstream & f) { // utils to check all bytes are read from stream int64_t curr_pos = f.tellg(); @@ -16,6 +14,11 @@ int64_t bytes_left(std::ifstream & f) { return bytes_left_to_read; } +template +static void read_safe(std::ifstream& fin, T& dest) { + fin.read((char*)& dest, sizeof(T)); +} + template 
inline bool all_close( std::vector s1, diff --git a/tests/common.h b/bark/tests/common.h similarity index 97% rename from tests/common.h rename to bark/tests/common.h index db89b67..6090e56 100644 --- a/tests/common.h +++ b/bark/tests/common.h @@ -1,5 +1,4 @@ #pragma once -#include "bark.h" #include #include @@ -9,6 +8,8 @@ typedef std::vector logit_sequence; typedef std::vector> logit_matrix; +typedef std::vector> bark_codes; + /* Comparison utils */ template inline bool all_equal(std::vector s1, std::vector s2, int * n_violations); diff --git a/bark/tests/test-coarse-encoder.cpp b/bark/tests/test-coarse-encoder.cpp new file mode 100644 index 0000000..9918c5a --- /dev/null +++ b/bark/tests/test-coarse-encoder.cpp @@ -0,0 +1,72 @@ +/* Usage: + +```bash + ./bin/test-coarse-encoder ../ggml_weights/ +``` +*/ +#include +#include +#include + +#include "bark.h" + +const int n_threads = 4; +const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM; + +const bark_sequence semantic_tokens = { + 1913, 8020, 8572, 8572, 1722, 59, 28, 28, 28, 8606, 7695, 7695, 6948, 9488, 92, 28, 107, 9296, 4093, 1640, 1449, 50, 1079, 441, 10, 41, 8275, 847, 8396, 8396, 6747, 7656, 2049, 7656, 5156, 5156, 8865, 178, 50, 178, 1015, 441, 10, 41, 3451, 5737, 2563, 3354, 4382, 734, 4683, 827, 396, 50, 10, 27, 27, 8093, 7401, 937, 937, 937, 259, 2066, 4485, 1385, 1385, 4, 4, 1385, 7588, 660, 252, 252, 252, 663, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96 // this is a dog barking. +}; +const bark_sequence semantic_tokens_2 = { + 10, 5785, 10, 6043, 6043, 6043, 6043, 6043, 6043, 4019, 8137, 4166, 5832, 7803, 8010, 8010, 8010, 6174, 6174, 741, 741, 6592, 741, 441, 10, 783, 206, 206, 206, 10, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 206, 344 , 10, 65, 344, 147, 55, 10, 57, 57, 7882, 6863, 6863, 4298, 9111, 9111 , 5862, 5862, 5862, 3741, 657, 120, 171, 2895, 741, 6750, 6750, 10, 2330, 2795 , 2795, 5131, 5131, 2415, 2415, 2130, 880, 8634, 59, 28, 28, 28, 28, 28 , 28, 28, 28, 28, 1133, 5586, 8607, 6799, 4156, 4156, 1177, 326, 326, 741 , 232, 100, 6401, 3670, 5899, 8266, 8266, 4825, 8522, 4323, 7298, 7298, 26, 26 , 26, 1262, 3705, 985, 6844, 441, 441, 5555, 10, 2690, 8428, 10, 985, 7010 , 147, 147, 2907, 2907, 59, 28, 28, 28, 28, 107, 1310, 8968, 8968, 9366 , 9366, 1732, 1732, 1732, 373, 9263, 4480, 4480, 4480, 4480, 4236, 1285, 1285, 1285 , 1285, 1285, 1285, 1285, 1285, 1285, 2997, 2997, 6662, 3761, 3761, 1003, 9293, 83 , 83, 20, 2881, 4978, 5457, 602, 147, 5457, 5457, 5457, 10, 7309, 147, 147 , 147, 217, 8934, 9046, 9510, 9510, 9510, 956, 956, 2320, 2320, 7283, 3088, 3212 , 1152, 3212, 122, 59, 28, 28, 28, 107, 107, 28, 107, 107, 223, 223 , 223, 223, 2784, 206, 230, 206, 1710, 602, 10, 5092, 9862, 10, 55, 206 , 193, 147, 193, 206, 206, 4374, 206, 206, 517, 206, 206, 10, 1278, 1278 , 2089, 147, 147, 10, 57, 604, 7882, 6863, 6863, 4298, 9111, 9111, 5862, 5862 , 9516, 3741, 3599, 120, 120, 1443, 8627, 7274, 1025, 10, 10, 6356, 1878, 8485 , 6703, 8922, 5951, 3506, 2237, 9218, 9218, 4977, 1697, 1697, 3599, 232, 1606, 1620 , 10, 99, 401, 6236, 3573, 9090, 9090, 9090, 298, 128, 5794, 8099, 7610, 389 , 9944, 823, 9456, 9456, 4238, 4238, 3645, 288, 120, 298, 5546, 2921, 2921, 6076 , 3937, 4909, 3937, 6501, 6501, 441, 10, 245, 3623, 3493, 2846, 9056, 9056, 3361 , 1112, 2180, 741, 211, 211, 10, 402, 10, 8934, 2673, 2389, 2389, 4382, 734 , 734, 4683, 9935, 5771, 7901, 232, 232, 10, 27, 27, 3971, 4089, 8844, 6750 , 441, 441, 10, 100, 6200, 3158, 8396, 8396, 8396, 2069, 557, 557, 7901, 
741 , 256, 2430, 59, 28, 28, 28, 107, 7883, 6027, 3182, 3182, 3755, 208, 208 , 2462, 232, 10, 6401, 4747, 9818, 7557, 7557, 7557, 208, 208, 5327, 2462, 441 , 10, 10, 41, 4942, 8022, 8022, 8726, 6664, 8726, 8522, 3767, 3767, 3767, 4775 , 6133, 281, 3374, 8376, 8376, 3374, 8376, 441, 8376, 763, 5092, 10, 56, 230 , 56, 230, 147, 206, 206, 206, 206, 5199, 206, 206, 206, 206, 206, 206 , 206, 206, 206, 206, 206, 147, 3252, 206, 91, 2966, 55, 1278, 147, 147 , 147, 55, 3961, 147, 147, 147, 302, 6356, 6513, 6513, 6513, 6513, 6513, 6513 , 6513, 421, 421, 4925, 4925, 4925, 4925, 4925, 7813, 7813, 7813, 1430, 8634, 8811 , 59, 59, 28, 28, 107, 6467, 9569, 5920, 9124, 9124, 5481, 5481, 2507, 2507 , 9921, 422, 215, 215, 215, 6123, 6123, 5916, 5916, 5916, 8184, 4698, 7900, 7900 , 7900, 7900, 664, 749, 278, 749, 749, 10, 5457, 602, 5457, 5457, 147, 55 , 7309, 10, 2330, 3540, 8772, 1430, 1430, 985, 441, 10, 1532, 2384, 8536, 5187 , 8869, 6105, 6105, 6105, 5313, 1471, 1471, 9935, 3561, 1242, 232, 100, 10, 27 , 4168, 4168, 4286, 8634, 8634, 207, 28, 28, 28, 254, 9569, 9569, 5920, 9124 , 5481, 5481, 2507, 2507, 441, 10, 329, 195, 1136, 1136, 3619, 5131, 5131, 1662 , 2415, 741, 10, 5026, 6043, 6043, 9662, 9662, 9002, 7857, 7857, 4786, 4786, 4323 , 4323, 26, 26, 26, 26, 2451, 2451, 10, 10, 266, 206, 206, 206, 206 , 206, 206, 206, 206, 206, 206, 206, 65, 206, 206, 206, 65, 344, 55 , 344, 147, 147, 10, 57, 302, 2201, 2201, 2201, 2201, 5411, 5411, 4554, 7714 , 7714, 2580, 1025, 1025, 1025, 7710, 1973, 1973, 535, 321 +}; + +std::vector > transpose(const std::vector > data) { + // this assumes that all inner vectors have the same size and + // allocates space for the complete result in advance + std::vector > result(data[0].size(), + std::vector(data.size())); + for (std::vector::size_type i = 0; i < data[0].size(); i++) + for (std::vector::size_type j = 0; j < data.size(); j++) { + result[i][j] = data[j][i]; + } + return result; +} + + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string weights_dir = argv[1]; + + // initialize bark context + struct bark_context * bctx = bark_load_model(weights_dir.c_str(), verbosity); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); + exit(1); + } + + bctx->semantic_tokens = semantic_tokens_2; + + // generate coarse tokens + if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) { + fprintf(stderr, "%s: failed to forward coarse encoder\n", __func__); + return 1; + } + + // print coarse tokens + fprintf(stderr, "shape of coarse tokens: [%zu, %zu]\n", bctx->coarse_tokens.size(), bctx->coarse_tokens[0].size()); + + bark_codes ct = transpose(bctx->coarse_tokens); + + for (size_t i = 0; i < ct.size(); i++) { + for (size_t j = 0; j < ct[i].size(); j++) { + fprintf(stderr, "%d ", ct[i][j]); + } + fprintf(stderr, "\n"); + } + + return 0; +} \ No newline at end of file diff --git a/bark/tests/test-fine-encoder.cpp b/bark/tests/test-fine-encoder.cpp new file mode 100644 index 0000000..78a2abf --- /dev/null +++ b/bark/tests/test-fine-encoder.cpp @@ -0,0 +1,70 @@ +/* Usage: + +```bash + ./bin/test-fine-encoder ../ggml_weights/ +``` +*/ +#include +#include +#include + +#include "bark.h" + +const int n_threads = 4; +const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM; + +const bark_codes coarse_tokens = { + { 395, 395, 395, 395, 475, 395, 475, 395, 395, 395, 395, 395, 819, 395, 395, 395, 395, 395, 395, 819, 819, 395, 395, 395, 395, 395, 395, 395, 395, 395, 537, 887, 
537, 499, 835, 475, 404, 475, 395, 475, 855, 257, 475, 404, 779, 779, 395, 395, 23, 59, 881, 59, 901, 151, 860, 819, 819, 819, 373, 819, 819, 635, 1011, 373, 798, 819, 373, 819, 709, 819, 819, 819, 635, 323, 192, 901, 59, 942, 871, 208, 430, 604, 834, 430, 475, 475, 395, 475, 537, 233, 747, 428, 683, 112, 402, 216, 683, 112, 402, 216, 216, 99, 683, 112, 402, 216, 216, 683, 112, 428, 428, 690, 942, 871, 208, 228, 904, 404, 404, 499, 404, 475, 395, 475, 257, 835, 475, 475, 475, 395, 475, 257, 475, 475, 855, 887, 392, 216, 683, 112, 112, 402, 11, 11, 11, 323, 91, 904, 404, 855, 404, 779, 677, 475, 59, 59, 151, 276, 23, 276, 276, 347, 347, 879, 753, 325, 879, 1011, 753, 276, 276, 753, 276, 228, 855, 835, 475, 475, 475, 475, 106, 475, 395, 537, 835, 257, 404, 835, 475, 887, 475, 475, 475, 855, 475, 475, 475, 475, 475, 475, 475, 475, 475, 475 }, + { 969, 928, 928, 913, 928, 43, 424, 913, 518, 200, 200, 544, 544, 200, 200, 200, 424, 200, 424, 544, 969, 200, 964, 200, 913, 969, 544, 200, 200, 544, 646, 200, 913, 648, 969, 518, 544, 424, 913, 518, 424, 544, 913, 424, 544, 913, 913, 544, 73, 504, 591, 952, 591, 655, 1007, 429, 603, 857, 4, 857, 896, 1010, 504, 35, 955, 67, 4, 1010, 857, 857, 857, 857, 961, 964, 381, 955, 952, 955, 386, 403, 601, 961, 765, 544, 913, 424, 765, 424, 928, 453, 403, 505, 833, 478, 478, 478, 478, 478, 478, 478, 478, 95, 478, 478, 478, 478, 478, 478, 95, 663, 136, 386, 386, 891, 770, 896, 516, 937, 544, 747, 928, 969, 913, 424, 363, 424, 424, 424, 424, 646, 913, 544, 928, 424, 544, 463, 478, 185, 776, 300, 685, 685, 371, 663, 513, 105, 1007, 770, 1007, 969, 544, 964, 648, 519, 717, 591, 833, 364, 364, 105, 364, 770, 200, 364, 519, 519, 519, 519, 519, 745, 942, 519, 829, 928, 859, 937, 913, 424, 544, 424, 424, 518, 200, 648, 928, 544, 544, 424, 424, 646, 913, 424, 913, 544, 913, 913, 913, 518, 928, 913, 913, 913, 913, 518}, +}; + +std::vector > transpose(const std::vector > data) { + // this assumes that all inner vectors have the same size and + // allocates space for the complete result in advance + std::vector > result(data[0].size(), + std::vector(data.size())); + for (std::vector::size_type i = 0; i < data[0].size(); i++) + for (std::vector::size_type j = 0; j < data.size(); j++) { + result[i][j] = data[j][i]; + } + return result; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string weights_dir = argv[1]; + + // initialize bark context + struct bark_context * bctx = bark_load_model(weights_dir.c_str(), verbosity); + if (!bctx) { + fprintf(stderr, "%s: Could not load model\n", __func__); + exit(1); + } + + bctx->coarse_tokens = transpose(coarse_tokens); + + // generate fine tokens + if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) { + fprintf(stderr, "%s: failed to forward fine encoder\n", __func__); + return 1; + } + + // print fine tokens + fprintf(stderr, "shape of fine tokens: [%zu, %zu]\n", bctx->fine_tokens.size(), bctx->fine_tokens[0].size()); + + bark_codes ft = transpose(bctx->fine_tokens); + // bark_codes ft = bctx->fine_tokens; + + for (size_t i = 0; i < ft.size(); i++) { + for (size_t j = 0; j < ft[i].size(); j++) { + fprintf(stderr, "%d ", ft[i][j]); + } + fprintf(stderr, "\n"); + } + + return 0; +} \ No newline at end of file diff --git a/tests/test-tokenizer.cpp b/bark/tests/test-tokenizer.cpp similarity index 89% rename from tests/test-tokenizer.cpp rename to bark/tests/test-tokenizer.cpp index 7fb8973..0ab00f5 100644 --- a/tests/test-tokenizer.cpp +++ 
b/bark/tests/test-tokenizer.cpp @@ -1,19 +1,16 @@ +/* Usage: + +```bash + ./bin/test-tokenizer ../ggml_weights/ggml_vocab.bin +``` +*/ #include #include #include #include -#define BARK_API_INTERNAL #include "bark.h" -struct bark_vocab { - using id = int32_t; - using token = std::string; - - std::map token_to_id; - std::map id_to_token; -}; - static const std::map & k_tests() { static std::map _k_tests = { @@ -39,9 +36,9 @@ int main(int argc, char **argv) { bark_vocab vocab; int max_ctx_size = 256; - if (bark_vocab_load(fname.c_str(), &vocab, 119547) > 0) { + if (!bark_vocab_load(fname.c_str(), &vocab, 119547)) { fprintf(stderr, "%s: invalid vocab file '%s'\n", __func__, fname.c_str()); - return 1; + exit(1); } for (const auto & test_kv : k_tests()) { diff --git a/encodec.cpp b/encodec.cpp deleted file mode 100644 index eee3730..0000000 --- a/encodec.cpp +++ /dev/null @@ -1,535 +0,0 @@ -#include "encodec.h" -#include "ggml.h" -#include "bark-util.h" - -#include -#include -#include -#include -#include -#include - -static void encodec_sigmoid_impl(struct ggml_tensor * dst, const struct ggml_tensor * src, int ith, int nth, void * userdata) { - GGML_ASSERT(userdata == NULL); - GGML_ASSERT(ggml_are_same_shape(dst, src)); - GGML_ASSERT(ggml_is_contiguous(dst)); - GGML_ASSERT(ggml_is_contiguous(src)); - - const float * src_data = ggml_get_data_f32(src); - float * dst_data = ggml_get_data_f32(dst); - - const int ne = (int)ggml_nelements(dst); - const int dr = (ne + nth - 1) / nth; - const int ie0 = dr * ith; - const int ie1 = std::min(ie0 + dr, ne); - - for (int i = ie0; i < ie1; ++i) { - dst_data[i] = 1.0f / (1.0f + expf(-src_data[i])); - } -} - -static struct ggml_tensor * encodec_sigmoid(ggml_context * ctx, struct ggml_tensor * x) { - return ggml_map_custom1(ctx, x, encodec_sigmoid_impl, GGML_N_TASKS_MAX, NULL); -} - -static int get_extra_padding_for_conv_1d(ggml_tensor * inp, float kernel_size, float stride, float padding_total) { - float length = inp->ne[0]; - float n_frames = (length - kernel_size + padding_total) / stride + 1.0f; - int ideal_length = (ceilf(n_frames) - 1) * stride + (kernel_size - padding_total); - return ideal_length - length; -} - -static struct ggml_tensor * pad_1d(ggml_context * ctx0, ggml_tensor * inp, int padding_left, int padding_right) { - int length = inp->ne[0]; - int dim = inp->ne[1]; - - const int max_pad = std::max(padding_left, padding_right); - int extra_pad = 0; - - if (length <= max_pad) { - extra_pad = max_pad - length + 1; - - // constant padding - struct ggml_tensor * out = ggml_new_tensor_2d(ctx0, inp->type, length+extra_pad, dim); - ggml_set_zero(out); - out = ggml_set_2d(ctx0, out, inp, out->nb[1], 0); - } - - struct ggml_tensor * padded = ggml_pad_reflec_1d(ctx0, inp, padding_left, padding_right); - - const int end = padded->ne[0] - extra_pad; - struct ggml_tensor *dest = ggml_view_2d(ctx0, padded, end, dim, padded->nb[1], 0); - - return dest; -} - -static struct ggml_tensor * unpad_1d(ggml_context * ctx0, ggml_tensor * inp, int padding_left, int padding_right) { - int length = inp->ne[0]; - int dim = inp->ne[1]; - - ENCODEC_ASSERT(padding_left >= 0); - ENCODEC_ASSERT(padding_right >= 0); - ENCODEC_ASSERT(padding_left + padding_right <= length); - - int end = length - padding_right; - - int offset = padding_left * inp->nb[1]; - struct ggml_tensor * dst = ggml_view_2d(ctx0, inp, end, dim, inp->nb[1], offset); - - return dst; -} - -static struct ggml_tensor * forward_pass_lstm_unilayer( - struct ggml_context * ctx0, - struct ggml_tensor * inp, - struct 
ggml_tensor * weight_ih, - struct ggml_tensor * weight_hh, - struct ggml_tensor * bias_ih, - struct ggml_tensor * bias_hh) { - - const int input_dim = inp->ne[1]; - const int hidden_dim = weight_ih->ne[1]/4; - const int seq_length = inp->ne[0]; - - struct ggml_tensor * hs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length); - - struct ggml_tensor * c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); - struct ggml_tensor * h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); - - h_t = ggml_set_zero(h_t); - c_t = ggml_set_zero(c_t); - - struct ggml_tensor * current = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); - - for (int t = 0; t < seq_length; t++) { - struct ggml_tensor * x_t = ggml_view_1d(ctx0, current, input_dim, t*current->nb[1]); - - struct ggml_tensor * inp_gates = ggml_mul_mat(ctx0, weight_ih, x_t); - inp_gates = ggml_add(ctx0, inp_gates, bias_ih); - - struct ggml_tensor * hid_gates = ggml_mul_mat(ctx0, weight_hh, h_t); - hid_gates = ggml_add(ctx0, hid_gates, bias_hh); - - struct ggml_tensor * out_gates = ggml_add(ctx0, inp_gates, hid_gates); - - struct ggml_tensor * i_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0*sizeof(float)*hidden_dim)); - struct ggml_tensor * f_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1*sizeof(float)*hidden_dim)); - struct ggml_tensor * g_t = ggml_tanh (ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 2*sizeof(float)*hidden_dim)); - struct ggml_tensor * o_t = encodec_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3*sizeof(float)*hidden_dim)); - - c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t)); - h_t = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_t)); - - hs = ggml_set_1d(ctx0, hs, h_t, t*hs->nb[1]); - } - - hs = ggml_cont(ctx0, ggml_transpose(ctx0, hs)); - - return hs; -} - -static struct ggml_tensor * strided_conv_1d( - ggml_context * ctx0, - ggml_tensor * inp, - ggml_tensor * conv_w, - ggml_tensor * conv_b, - int stride) { - int kernel_size = conv_w->ne[0]; - int padding_total = kernel_size - stride; - int extra_padding = get_extra_padding_for_conv_1d(inp, kernel_size, stride, padding_total); - - struct ggml_tensor * padded_inp = pad_1d(ctx0, inp, padding_total, extra_padding); - struct ggml_tensor * dst = ggml_conv_1d(ctx0, conv_w, padded_inp, stride, 0, 1); - - // add bias - dst = ggml_transpose(ctx0, dst); - dst = ggml_add(ctx0, ggml_repeat(ctx0, conv_b, dst), dst); - dst = ggml_cont(ctx0, ggml_transpose(ctx0, dst)); - - return dst; -} - -static struct ggml_tensor * strided_conv_transpose_1d( - ggml_context * ctx0, - ggml_tensor * inp, - ggml_tensor * conv_w, - ggml_tensor * conv_b, - int stride) { - int kernel_size = conv_w->ne[0]; - int padding_total = kernel_size - stride; - - struct ggml_tensor * dst = ggml_conv_transpose_1d(ctx0, conv_w, inp, stride, 0, 1); - - // add bias - dst = ggml_transpose(ctx0, dst); - dst = ggml_add(ctx0, ggml_repeat(ctx0, conv_b, dst), dst); - dst = ggml_cont(ctx0, ggml_transpose(ctx0, dst)); - - int padding_right = ceilf(padding_total); - int padding_left = padding_total - padding_right; - - struct ggml_tensor * unpadded = unpad_1d(ctx0, dst, padding_left, padding_right); - unpadded = ggml_cont(ctx0, unpadded); - - return unpadded; -} - -int encodec_model_load(const std::string& fname, encodec_model& model) { - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return 1; - } - - // verify magic (i.e. 
ggml signature in hex format) - { - uint32_t magic; - read_safe(fin, magic); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return 1; - } - } - - auto & ctx = model.ctx; - size_t ctx_size = 0; - - // Evaluating context size - { - const auto & hparams = model.hparams; - - const int in_channels = hparams.in_channels; - const int hidden_dim = hparams.hidden_dim; - const int n_filters = hparams.n_filters; - const int kernel_size = hparams.kernel_size; - const int res_kernel_sz = hparams.residual_kernel_size; - const int n_q = hparams.n_q; - const int n_bins = hparams.n_bins; - const int *ratios = hparams.ratios; - - // decoder - { - // initial conv1d layer - ctx_size += in_channels*n_filters*kernel_size*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += n_filters*ggml_type_size(GGML_TYPE_F32); //bias - - int mult = 1; // scaling factor for hidden size - - for (int i = 0; i < 4; i++) { - // conv1 - ctx_size += res_kernel_sz*(mult*n_filters)*(mult*n_filters/2)*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += (mult*n_filters/2)*ggml_type_size(GGML_TYPE_F32); // bias - - // conv2 - ctx_size += (mult*n_filters/2)*(mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - - // shortcut conv - ctx_size += (mult*n_filters)*(mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters)*ggml_type_size(GGML_TYPE_F32); - - // downsampling blocks - ctx_size += (2*ratios[i])*(mult*n_filters)*(mult*n_filters*2)*ggml_type_size(GGML_TYPE_F32); - ctx_size += (mult*n_filters*2)*ggml_type_size(GGML_TYPE_F32); - - mult *= 2; - } - - // lstm - { - // l0_ih, l0_hh, l1_ih, l1_hh all have the same shapes, hence 4 - ctx_size += 4*(mult*n_filters)*(4*mult*n_filters)*ggml_type_size(GGML_TYPE_F32); // weight - ctx_size += 4*(4*mult*n_filters)*ggml_type_size(GGML_TYPE_F32); // bias - } - - // final conv - ctx_size += kernel_size*(mult*n_filters)*hidden_dim*ggml_type_size(GGML_TYPE_F32); - ctx_size += hidden_dim*ggml_type_size(GGML_TYPE_F32); - } - - // quantizer - { - ctx_size += n_q*hidden_dim*n_bins; // embed - } - - ctx_size += 10ull*MB; // object overhead - } - - // create the ggml context - { - struct ggml_init_params params = { - /* .mem_size = */ ctx_size, - /* .mem_buffer = */ NULL, - /* .no_alloc = */ false, - }; - - model.ctx = ggml_init(params); - if(!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return 1; - } - } - - // prepare memory for the weights - { - const auto & hparams = model.hparams; - - const int in_channels = hparams.in_channels; - const int hidden_dim = hparams.hidden_dim; - const int n_filters = hparams.n_filters; - const int kernel_size = hparams.kernel_size; - const int res_kernel_sz = hparams.residual_kernel_size; - const int n_q = hparams.n_q; - const int *ratios = hparams.ratios; - const int n_bins = hparams.n_bins; - - // decoder - { - model.decoder.blocks.resize(4); - - int mult = 16; // 2**len(ratios) - - model.decoder.init_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, kernel_size, hidden_dim, mult*n_filters); - model.decoder.init_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters); - - model.tensors["decoder.model.0.conv.conv.weight"] = model.decoder.init_conv_w; - model.tensors["decoder.model.0.conv.conv.bias"] = model.decoder.init_conv_b; - - // LSTM - model.decoder.lstm.l0_ih_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - model.decoder.lstm.l1_ih_w = 
ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.weight_ih_l0"] = model.decoder.lstm.l0_ih_w; - model.tensors["decoder.model.1.lstm.weight_ih_l1"] = model.decoder.lstm.l1_ih_w; - - model.decoder.lstm.l0_hh_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - model.decoder.lstm.l1_hh_w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, mult*n_filters, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.weight_hh_l0"] = model.decoder.lstm.l0_hh_w; - model.tensors["decoder.model.1.lstm.weight_hh_l1"] = model.decoder.lstm.l1_hh_w; - - model.decoder.lstm.l0_ih_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - model.decoder.lstm.l1_ih_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.bias_ih_l0"] = model.decoder.lstm.l0_ih_b; - model.tensors["decoder.model.1.lstm.bias_ih_l1"] = model.decoder.lstm.l1_ih_b; - - model.decoder.lstm.l0_hh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - model.decoder.lstm.l1_hh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*mult*n_filters); - - model.tensors["decoder.model.1.lstm.bias_hh_l0"] = model.decoder.lstm.l0_hh_b; - model.tensors["decoder.model.1.lstm.bias_hh_l1"] = model.decoder.lstm.l1_hh_b; - - for (int i = 0; i < 4; i++) { - // upsampling - model.decoder.blocks[i].us_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, ratios[i]*2, mult*n_filters/2, mult*n_filters); - model.decoder.blocks[i].us_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)) + ".convtr.convtr.weight"] = model.decoder.blocks[i].us_conv_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)) + ".convtr.convtr.bias"] = model.decoder.blocks[i].us_conv_b; - - // conv1 - model.decoder.blocks[i].conv_1_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, res_kernel_sz, mult*n_filters/2, mult*n_filters/4); - model.decoder.blocks[i].conv_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/4); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.1.conv.conv.weight"] = model.decoder.blocks[i].conv_1_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.1.conv.conv.bias"] = model.decoder.blocks[i].conv_1_b; - - // conv2 - model.decoder.blocks[i].conv_2_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, mult*n_filters/4, mult*n_filters/2); - model.decoder.blocks[i].conv_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.3.conv.conv.weight"] = model.decoder.blocks[i].conv_2_w; - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".block.3.conv.conv.bias"] = model.decoder.blocks[i].conv_2_b; - - // shortcut - model.decoder.blocks[i].conv_sc_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, mult*n_filters/2, mult*n_filters/2); - model.decoder.blocks[i].conv_sc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, mult*n_filters/2); - - model.tensors["decoder.model." + std::to_string(3*(i+1)+1) + ".shortcut.conv.conv.weight"] = model.decoder.blocks[i].conv_sc_w; - model.tensors["decoder.model." 
+ std::to_string(3*(i+1)+1) + ".shortcut.conv.conv.bias"] = model.decoder.blocks[i].conv_sc_b; - - mult /= 2; - } - - model.decoder.final_conv_w = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, kernel_size, n_filters, in_channels); - model.decoder.final_conv_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels); - - model.tensors["decoder.model.15.conv.conv.weight"] = model.decoder.final_conv_w; - model.tensors["decoder.model.15.conv.conv.bias"] = model.decoder.final_conv_b; - } - - // quantizer - { - model.quantizer.blocks.resize(n_q); - for (int i = 0; i < n_q; i++) { - model.quantizer.blocks[i].embed = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, hidden_dim, n_bins); - model.tensors["quantizer.vq.layers." + std::to_string(i) + "._codebook.embed"] = model.quantizer.blocks[i].embed; - } - } - - } - - // load weights - { - size_t total_size = 0; - model.n_loaded = 0; - - while(true) { - int32_t n_dims; - int32_t length; - int32_t ftype; - - read_safe(fin, n_dims); - read_safe(fin, length); - read_safe(fin, ftype); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[3] = {1, 1, 1}; - for (int i = 0; i < n_dims; i++) { - read_safe(fin, ne[i]); - nelements *= ne[i]; - } - - std::string name; - std::vector buf(length); - fin.read(&buf[0], buf.size()); - name.assign(&buf[0], buf.size()); - - if (model.tensors.find(name.data()) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); - return 1; - } - - auto tensor = model.tensors[name.data()]; - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); - return 1; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%d, %d, %d]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]); - return 1; - } - - const size_t bpe = ggml_type_size(ggml_type(ftype)); - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); - return 1; - } - - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - - // printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - - total_size += ggml_nbytes(tensor); - model.n_loaded++; - } - - fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - return 0; -} - -struct ggml_tensor * encodec_quantizer_decode_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * codes) { - // codes: [seq_length, n_codes] - const int hidden_dim = model.hparams.hidden_dim; - const int seq_length = codes->ne[0]; - const int n_q = codes->ne[1]; - - struct ggml_tensor * quantized_out = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length); - quantized_out = ggml_set_zero(quantized_out); - - for (int i = 0; i < n_q; i++) { - encodec_quant_block block = model.quantizer.blocks[i]; - - struct ggml_tensor * indices = ggml_view_1d(ctx0, codes, seq_length, i*codes->nb[1]); - struct ggml_tensor * quantized = ggml_get_rows(ctx0, block.embed, indices); - - quantized_out = ggml_add(ctx0, quantized_out, quantized); - } - - quantized_out = ggml_cont(ctx0, ggml_transpose(ctx0, quantized_out)); - - return quantized_out; -} - -struct ggml_tensor * encodec_decoder_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * quantized_out) { - const auto & hparams = model.hparams; - const int * ratios = hparams.ratios; - const int stride = hparams.stride; - - struct ggml_tensor * inpL = strided_conv_1d( - ctx0, quantized_out, model.decoder.init_conv_w, model.decoder.init_conv_b, stride); - - // lstm - { - struct ggml_tensor * cur = inpL; - - const encodec_lstm lstm = model.decoder.lstm; - - // first lstm layer - struct ggml_tensor * hs1 = forward_pass_lstm_unilayer( - ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b); - - // second lstm layer - struct ggml_tensor * out = forward_pass_lstm_unilayer( - ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b); - - inpL = ggml_add(ctx0, inpL, out); - } - - for (int layer_ix = 0; layer_ix < 4; layer_ix++) { - encodec_decoder_block block = model.decoder.blocks[layer_ix]; - - // upsampling layers - inpL = ggml_elu(ctx0, inpL); - - inpL = strided_conv_transpose_1d( - ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]); - - struct ggml_tensor * current = inpL; - - // shortcut - struct ggml_tensor * shortcut = strided_conv_1d( - ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride); - - // conv1 - current = ggml_elu(ctx0, current); - - current = strided_conv_1d( - ctx0, current, block.conv_1_w, block.conv_1_b, stride); - - // conv2 - current = ggml_elu(ctx0, current); - - current = strided_conv_1d( - ctx0, current, block.conv_2_w, block.conv_2_b, stride); - - // residual connection - inpL = ggml_add(ctx0, current, shortcut); - } - - // final conv - inpL = ggml_elu(ctx0, inpL); - - struct ggml_tensor * output = strided_conv_1d( - ctx0, inpL, model.decoder.final_conv_w, model.decoder.final_conv_b, stride); - - return output; -} diff --git a/encodec.cpp b/encodec.cpp new file mode 160000 index 0000000..e50cd96 --- /dev/null +++ b/encodec.cpp @@ -0,0 +1 @@ +Subproject commit e50cd96d28c89f6c1343c291042b14bab6f3b83b diff --git a/encodec.h b/encodec.h deleted file mode 100644 index d19af6d..0000000 --- a/encodec.h +++ /dev/null @@ -1,142 +0,0 @@ -/* This is a shortened version of the original Encodec.CPP here: https://github.com/PABannier/encodec.cpp. -Only the decoding quantizer and decoder part is implemented in this file. 
-*/ -#pragma once - -#include "ggml.h" - -#include -#include -#include -#include -#include -#include -#include - -#define ENCODEC_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "ENCODEC_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - -struct encodec_hparams { - int32_t in_channels = 1; - int32_t hidden_dim = 128; - int32_t n_filters = 32; - int32_t ratios[4] = {8, 5, 4, 2}; - int32_t kernel_size = 7; - int32_t residual_kernel_size = 3; - int32_t compress = 2; - int32_t n_lstm_layers = 2; - int32_t stride = 1; - - // 24kbps (n_q=32) - int32_t n_q = 32; - int32_t n_bins = 1024; - int32_t sr = 24000; -}; - -// res + downsample block at some ratio -struct encodec_encoder_block { - // conv1 - struct ggml_tensor * conv_1_w; - struct ggml_tensor * conv_1_b; - - // conv2 - struct ggml_tensor * conv_2_w; - struct ggml_tensor * conv_2_b; - - // shortcut - struct ggml_tensor * conv_sc_w; - struct ggml_tensor * conv_sc_b; - - // downsampling layers - struct ggml_tensor * ds_conv_w; - struct ggml_tensor * ds_conv_b; -}; - -struct encodec_lstm { - struct ggml_tensor * l0_ih_w; - struct ggml_tensor * l0_hh_w; - - struct ggml_tensor * l0_ih_b; - struct ggml_tensor * l0_hh_b; - - struct ggml_tensor * l1_ih_w; - struct ggml_tensor * l1_hh_w; - - struct ggml_tensor * l1_ih_b; - struct ggml_tensor * l1_hh_b; -}; - -struct encodec_quant_block { - struct ggml_tensor * embed; -}; - -struct encodec_quantizer { - std::vector blocks; -}; - -struct encodec_decoder_block { - //upsampling layers - struct ggml_tensor * us_conv_w; - struct ggml_tensor * us_conv_b; - - // conv1 - struct ggml_tensor * conv_1_w; - struct ggml_tensor * conv_1_b; - - // conv2 - struct ggml_tensor * conv_2_w; - struct ggml_tensor * conv_2_b; - - // shortcut - struct ggml_tensor * conv_sc_w; - struct ggml_tensor * conv_sc_b; -}; - -struct encodec_decoder { - struct ggml_tensor * init_conv_w; - struct ggml_tensor * init_conv_b; - - encodec_lstm lstm; - - struct ggml_tensor * final_conv_w; - struct ggml_tensor * final_conv_b; - - std::vector blocks; -}; - -struct encodec_model { - encodec_hparams hparams; - - encodec_quantizer quantizer; - encodec_decoder decoder; - - // context - struct ggml_context * ctx; - int n_loaded; - - std::map tensors; - - int64_t t_predict_us = 0; - int64_t t_main_us = 0; - - int64_t memsize = 0; - size_t mem_per_token = 0; -}; - - -int encodec_model_load(const std::string& fname, encodec_model& model); - -struct ggml_tensor * encodec_quantizer_decode_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * codes); - -struct ggml_tensor * encodec_decoder_eval( - struct ggml_context * ctx0, - const encodec_model & model, - struct ggml_tensor * quantized_out); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index d167621..0000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) - -add_subdirectory(main) -add_subdirectory(server) -add_subdirectory(quantize) diff --git a/examples/main/main.cpp b/examples/main/main.cpp deleted file mode 100644 index 7a1dab9..0000000 --- a/examples/main/main.cpp +++ /dev/null @@ -1,123 +0,0 @@ -#include "ggml.h" -#include "bark.h" - -#include - -struct bark_params { - int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - - // user prompt - std::string prompt = "this is an audio"; - - // paths - std::string model_path = "./ggml_weights"; - std::string dest_wav_path = "output.wav"; - - 
int32_t seed = 0; -}; - -void bark_print_usage(char ** argv, const bark_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -s N, --seed N seed for random number generator (default: %d)\n", params.seed); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: random)\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model_path.c_str()); - fprintf(stderr, " -o FNAME, --outwav FNAME\n"); - fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str()); - fprintf(stderr, "\n"); -} - -int bark_params_parse(int argc, char ** argv, bark_params & params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--prompt") { - params.prompt = argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model_path = argv[++i]; - } else if (arg == "-s" || arg == "--seed") { - params.seed = std::stoi(argv[++i]); - } else if (arg == "-o" || arg == "--outwav") { - params.dest_wav_path = argv[++i]; - } else if (arg == "-h" || arg == "--help") { - bark_print_usage(argv, params); - exit(0); - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - bark_print_usage(argv, params); - exit(0); - } - } - - return 0; -} - -std::tuple bark_init_from_params(bark_params & params) { - bark_model * model = bark_load_model_from_file(params.model_path.c_str()); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model_path.c_str()); - return std::make_tuple(nullptr, nullptr); - } - - bark_context_params bctx_params = bark_context_default_params(); - bark_context * bctx = bark_new_context_with_model(model, bctx_params); - if (bctx == NULL) { - fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model_path.c_str()); - bark_free_model(model); - return std::make_tuple(nullptr, nullptr); - } - - return std::make_tuple(model, bctx); -} - -int main(int argc, char **argv) { - ggml_time_init(); - const int64_t t_main_start_us = ggml_time_us(); - - bark_params params; - - if (bark_params_parse(argc, argv, params) > 0) { - fprintf(stderr, "%s: Could not parse arguments\n", __func__); - return 1; - } - - int64_t t_load_us = 0; - int64_t t_eval_us = 0; - - bark_context * bctx; - bark_model * model; - - // load the model - const int64_t t_start_us = ggml_time_us(); - std::tie(model, bctx) = bark_init_from_params(params); - t_load_us = ggml_time_us() - t_start_us; - - printf("\n"); - - bark_seed_rng(bctx, params.seed); - - const int64_t t_eval_us_start = ggml_time_us(); - bark_generate_audio(bctx, params.prompt.data(), params.dest_wav_path.c_str(), params.n_threads); - t_eval_us = ggml_time_us() - t_eval_us_start; - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - bark_free(bctx); - - return 0; -} 
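The deleted `examples/main/main.cpp` above drove generation through the old two-step API (`bark_load_model_from_file` followed by `bark_new_context_with_model`). The replacement `bark/examples/main/main.cpp` and the updated `server.cpp` in this diff go through a single `bark_context` instead. Below is a minimal sketch of that new call flow, assuming only the signatures visible elsewhere in this diff (`bark_load_model`, `bark_generate_audio`, `write_wav_on_disk`); the weights path, the prompt, and the boolean return check on `bark_generate_audio` are placeholders / assumptions, not a definitive implementation.

```cpp
// Minimal sketch of the new single-context API, as used by the updated
// examples in this diff. Signatures are taken from the new main.cpp and
// server.cpp shown above; the weights path and prompt are placeholders.
#include <cstdio>
#include <string>

#include "bark.h"

int main() {
    const std::string model_path = "./ggml_weights";   // placeholder path
    const std::string dest_wav   = "./output.wav";
    const int n_threads = 4;

    // One call replaces bark_load_model_from_file + bark_new_context_with_model.
    struct bark_context * bctx = bark_load_model(model_path.c_str(), bark_verbosity_level::LOW);
    if (!bctx) {
        fprintf(stderr, "failed to load model from '%s'\n", model_path.c_str());
        return 1;
    }

    // Text in, audio out; the context owns all intermediate token buffers.
    // Treating the return value as a success flag is an assumption here.
    if (!bark_generate_audio(bctx, "this is an audio", dest_wav, n_threads, bark_verbosity_level::LOW)) {
        fprintf(stderr, "generation failed\n");
        bark_free(bctx);
        return 1;
    }

    // As in the new main.cpp, the generated samples stay available on the
    // context and can be written out explicitly.
    write_wav_on_disk(bctx->audio_arr, dest_wav);

    bark_free(bctx);
    return 0;
}
```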
diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt deleted file mode 100644 index d0cb815..0000000 --- a/examples/quantize/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -set(TARGET quantize) - -add_executable(${TARGET} quantize.cpp) - -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) - -if(MSVC) - target_compile_definitions(${TARGET} PRIVATE -D_CRT_SECURE_NO_WARNINGS=1) -endif() \ No newline at end of file diff --git a/ggml b/ggml deleted file mode 160000 index a16b01d..0000000 --- a/ggml +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a16b01d6891fd885800988003d53755c9574c6e4 diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake deleted file mode 100644 index 5023b77..0000000 --- a/scripts/build-info.cmake +++ /dev/null @@ -1,53 +0,0 @@ -set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.h.in") -set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") -set(BUILD_NUMBER 0) -set(BUILD_COMMIT "unknown") - -# Look for git -find_package(Git) -if(NOT Git_FOUND) - execute_process( - COMMAND which git - OUTPUT_VARIABLE GIT_EXECUTABLE - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(NOT GIT_EXECUTABLE STREQUAL "") - set(Git_FOUND TRUE) - message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") - else() - message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.") - endif() -endif() - -# Get the commit count and hash -if(Git_FOUND) - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE HEAD - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE GIT_HEAD_RESULT - ) - execute_process( - COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - OUTPUT_VARIABLE COUNT - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE GIT_COUNT_RESULT - ) - if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) - set(BUILD_COMMIT ${HEAD}) - set(BUILD_NUMBER ${COUNT}) - endif() -endif() - -# Only write the header if it's changed to prevent unnecessary recompilation -if(EXISTS ${HEADER_FILE}) - file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") - list(GET CONTENTS 0 EXISTING) - if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) - endif() -else() - configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) -endif() diff --git a/scripts/build-info.h.in b/scripts/build-info.h.in deleted file mode 100644 index 75d1e16..0000000 --- a/scripts/build-info.h.in +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef BUILD_INFO_H -#define BUILD_INFO_H - -#define BUILD_NUMBER @BUILD_NUMBER@ -#define BUILD_COMMIT "@BUILD_COMMIT@" - -#endif // BUILD_INFO_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt deleted file mode 100644 index e386832..0000000 --- a/tests/CMakeLists.txt +++ /dev/null @@ -1,17 +0,0 @@ -function(bark_add_test source) - get_filename_component(TEST_TARGET ${source} NAME_WE) - add_executable(${TEST_TARGET} ${source}) - install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE bark.cpp ${CMAKE_THREAD_LIBS_INIT}) - target_compile_features(${TEST_TARGET} PRIVATE cxx_std_11) - add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) -endfunction() - -bark_add_test(test-tokenizer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_vocab.bin) -# 
bark_add_test(test-text-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-coarse-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_coarse.bin) -# bark_add_test(test-fine-encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_fine.bin) -# bark_add_test(test-forward-semantic.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-forward-coarse.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_text.bin) -# bark_add_test(test-forward-fine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_fine.bin) -# bark_add_test(test-forward-encodec.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../ggml_weights/ggml_weights_encodec.bin) diff --git a/tests/data/coarse/test_pass_coarse_1.bin b/tests/data/coarse/test_pass_coarse_1.bin deleted file mode 100644 index 7612c6c..0000000 Binary files a/tests/data/coarse/test_pass_coarse_1.bin and /dev/null differ diff --git a/tests/data/coarse/test_pass_coarse_2.bin b/tests/data/coarse/test_pass_coarse_2.bin deleted file mode 100644 index 268792c..0000000 Binary files a/tests/data/coarse/test_pass_coarse_2.bin and /dev/null differ diff --git a/tests/data/coarse/test_pass_coarse_3.bin b/tests/data/coarse/test_pass_coarse_3.bin deleted file mode 100644 index 582382a..0000000 Binary files a/tests/data/coarse/test_pass_coarse_3.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_1.bin b/tests/data/fine/test_pass_fine_1.bin deleted file mode 100644 index 7dd2a1a..0000000 Binary files a/tests/data/fine/test_pass_fine_1.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_2.bin b/tests/data/fine/test_pass_fine_2.bin deleted file mode 100644 index 9e7c61e..0000000 Binary files a/tests/data/fine/test_pass_fine_2.bin and /dev/null differ diff --git a/tests/data/fine/test_pass_fine_3.bin b/tests/data/fine/test_pass_fine_3.bin deleted file mode 100644 index 34fff18..0000000 Binary files a/tests/data/fine/test_pass_fine_3.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin deleted file mode 100644 index a75b57e..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_1.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin deleted file mode 100644 index 7bc5904..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_2.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin deleted file mode 100644 index b4472b0..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_3.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin deleted file mode 100644 index 82026e2..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_4.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin deleted file mode 100644 index 38fb523..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_5.bin and /dev/null differ diff --git a/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin b/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin deleted file mode 100644 index 81f38c5..0000000 Binary files a/tests/data/fine_gpt_eval/test_fine_gpt_eval_6.bin and /dev/null differ diff 
--git a/tests/data/gpt_eval/test_gpt_eval_1_merge.bin b/tests/data/gpt_eval/test_gpt_eval_1_merge.bin deleted file mode 100644 index 1a86c5a..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_1_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin deleted file mode 100644 index 6349af1..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_1_no_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_2_merge.bin b/tests/data/gpt_eval/test_gpt_eval_2_merge.bin deleted file mode 100644 index 4e954f3..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_2_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin deleted file mode 100644 index 7355c98..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_2_no_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_3_merge.bin b/tests/data/gpt_eval/test_gpt_eval_3_merge.bin deleted file mode 100644 index fbed27b..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_3_merge.bin and /dev/null differ diff --git a/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin b/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin deleted file mode 100644 index af57d05..0000000 Binary files a/tests/data/gpt_eval/test_gpt_eval_3_no_merge.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_1.bin b/tests/data/semantic/test_pass_semantic_1.bin deleted file mode 100644 index c13b44a..0000000 Binary files a/tests/data/semantic/test_pass_semantic_1.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_2.bin b/tests/data/semantic/test_pass_semantic_2.bin deleted file mode 100644 index 7738dbf..0000000 Binary files a/tests/data/semantic/test_pass_semantic_2.bin and /dev/null differ diff --git a/tests/data/semantic/test_pass_semantic_3.bin b/tests/data/semantic/test_pass_semantic_3.bin deleted file mode 100644 index b5bfd9b..0000000 Binary files a/tests/data/semantic/test_pass_semantic_3.bin and /dev/null differ diff --git a/tests/test-fine-gpt-eval.cpp b/tests/test-fine-gpt-eval.cpp deleted file mode 100644 index a7630dc..0000000 --- a/tests/test-fine-gpt-eval.cpp +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include - -#include "bark.h" -#include "common.h" - - -static const std::vector> test_args = { - { "./data/fine_gpt_eval/test_fine_gpt_eval_1.bin", 2 }, // prompt: Hello, my name is Suno. And, uh - and I like pizza. [laughs] But I also have other interests such as playing tic tac toe. - { "./data/fine_gpt_eval/test_fine_gpt_eval_2.bin", 3 }, // prompt: Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible. - { "./data/fine_gpt_eval/test_fine_gpt_eval_3.bin", 4 }, // prompt: ♪ In the jungle, the mighty jungle, the lion barks tonight ♪ - { "./data/fine_gpt_eval/test_fine_gpt_eval_4.bin", 5 }, // prompt: I have a silky smooth voice, and today I will tell you about the exercise regimen of the common sloth. - { "./data/fine_gpt_eval/test_fine_gpt_eval_5.bin", 6 }, // prompt: You cannot, my good sir, take that away from me without having me retaliate in the most ferocious way. - { "./data/fine_gpt_eval/test_fine_gpt_eval_6.bin", 7 }, // prompt: C’est un roc ! c’est un pic ! c’est un cap ! Que dis-je, c’est un cap ? C’est une péninsule ! 
-}; - -static const int n_threads = 4; - -template -std::vector flatten(std::vector> const &vec) { - std::vector flattened; - for (auto const &v: vec) { - flattened.insert(flattened.end(), v.begin(), v.end()); - } - return flattened; -} - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_fine.bin"; - - gpt_model model; - if (gpt_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_codes tokens; - std::vector gt_logits, logits; - - // dry run to estimate mem_per_token - bark_sequence decoy = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 }; - fine_gpt_eval(model, decoy.data(), decoy.size(), nullptr, n_threads, 2); - - for (int i = 0; i < (int) test_args.size(); i++) { - std::string path = std::get<0>(test_args[i]); - int codebook_ix = std::get<1>(test_args[i]); - - tokens.clear(); - gt_logits.clear(); - logits.clear(); - - load_test_data(path, tokens, gt_logits); - tokens = transpose(tokens); - - std::vector tokens_vec = flatten(tokens); - - logits.resize(1024*1056); - fine_gpt_eval(model, tokens_vec.data(), tokens_vec.size(), logits.data(), n_threads, codebook_ix); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_logits, logits)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} \ No newline at end of file diff --git a/tests/test-forward-coarse.cpp b/tests/test-forward-coarse.cpp deleted file mode 100644 index 75f7aa8..0000000 --- a/tests/test-forward-coarse.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/coarse/test_pass_coarse_1.bin", // prompt: The amount of random conversations that lead to culture-shifting ideas is insane. 
- "./data/coarse/test_pass_coarse_2.bin", // prompt: Des Teufels liebstes Möbelstück ist die lange Bank - "./data/coarse/test_pass_coarse_3.bin", // prompt: खुदा ने बहुत सी अच्छी चीज बनाई है उस में एक हमारा दिमाग भी है बस उसे Use करने के लिए बता देता तो हम भी करोड़पति बन जाते I -}; - -static const int n_threads = 4; -static const int sliding_window_size = 60; -static const int max_coarse_history = 630; -static const float temp = 0.0f; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_coarse.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.coarse_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_sequence input; - bark_codes gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - ctx->semantic_tokens = input; - - bark_forward_coarse_encoder(ctx, max_coarse_history, sliding_window_size, temp, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(transpose(gt_tokens), ctx->coarse_tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-forward-encodec.cpp b/tests/test-forward-encodec.cpp deleted file mode 100644 index 5c9ee84..0000000 --- a/tests/test-forward-encodec.cpp +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/encodec/test_pass_encodec_1.bin", // prompt: El hombre que se levanta es aún más grande que el que no ha caído. - "./data/encodec/test_pass_encodec_2.bin", // prompt: ♪ Heal the world, Make it a better place, For you and for me, and the entire human race ♪ - "./data/encodec/test_pass_encodec_3.bin", // prompt: En été, mieux vaut suer que trembler. -}; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_codec.bin"; - - encodec_model model; - if (encodec_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_codes tokens; - audio_arr_t gt_audio_arr, audio_arr; - - for (int i = 0; i < (int) test_data.size(); i++) { - tokens.clear(); - gt_audio_arr.clear(); - audio_arr.clear(); - - std::string path = test_data[i]; - load_test_data(path, tokens, gt_audio_arr); - - audio_arr_t audio_arr = bark_forward_encodec(transpose(tokens), model); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_audio_arr, audio_arr)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} diff --git a/tests/test-forward-fine.cpp b/tests/test-forward-fine.cpp deleted file mode 100644 index 732f04e..0000000 --- a/tests/test-forward-fine.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/fine/test_pass_fine_1.bin", // prompt: Peut-on savoir où s'arrête le normal, où commence l'anormal ? Vous pouvez définir ces notions, vous, normalité, anormalité ? - "./data/fine/test_pass_fine_2.bin", // prompt: Brevity is the soul of wit. 
- "./data/fine/test_pass_fine_3.bin", // prompt: El hombre que se levanta es aún más grande que el que no ha caído. -}; - -static const int n_threads = 4; -static const float temp = 0.0f; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_fine.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.fine_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_codes input, gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - - // TODO: need to remove transpose - ctx->coarse_tokens = transpose(input); - bark_forward_fine_encoder(ctx, temp, n_threads); - bark_codes tokens = transpose(ctx->fine_tokens); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_tokens, tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-forward-semantic.cpp b/tests/test-forward-semantic.cpp deleted file mode 100644 index 8cd2375..0000000 --- a/tests/test-forward-semantic.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include - -#include "bark.h" -#include "common.h" - -static const std::vector test_data = { - "./data/semantic/test_pass_semantic_1.bin", // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark. - "./data/semantic/test_pass_semantic_2.bin", // prompt: Sometimes the heart sees what is invisible to the eye - "./data/semantic/test_pass_semantic_3.bin", // prompt: El Arte de Vencer se Aprende en las Derrotas -}; - -static const int n_threads = 4; -static const float min_eos_p = 0.2; -static const float temp = 0.0f; // deterministic sampling - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_text.bin"; - - std::mt19937 rng(0); - - bark_model model; - - if (gpt_model_load(fname, model.text_model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_context * ctx = bark_new_context_with_model(&model); - ctx->rng = rng; - - bark_sequence input; - bark_sequence gt_tokens; - - for (int i = 0; i < (int) test_data.size(); i++) { - input.clear(); - gt_tokens.clear(); - - std::string path = test_data[i]; - load_test_data(path, input, gt_tokens); - ctx->tokens = input; - - bark_forward_text_encoder(ctx, temp, min_eos_p, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_tokens, ctx->semantic_tokens)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - bark_free(ctx); - - return 0; -} diff --git a/tests/test-gpt-eval.cpp b/tests/test-gpt-eval.cpp deleted file mode 100644 index 805755e..0000000 --- a/tests/test-gpt-eval.cpp +++ /dev/null @@ -1,65 +0,0 @@ -#include -#include - -#include "bark.h" -#include "common.h" - - -static const std::vector> test_args = { - { "./data/gpt_eval/test_gpt_eval_1_no_merge.bin", false }, // prompt: Hello, my name is Suno. And, uh - and I like pizza. [laughs] But I also have other interests such as playing tic tac toe. - { "./data/gpt_eval/test_gpt_eval_2_no_merge.bin", false }, // prompt: Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. 
But I suppose your english isn't terrible. - { "./data/gpt_eval/test_gpt_eval_3_no_merge.bin", false }, // prompt: ♪ In the jungle, the mighty jungle, the lion barks tonight ♪ - - { "./data/gpt_eval/test_gpt_eval_1_merge.bin", true }, // prompt: I have a silky smooth voice, and today I will tell you about the exercise regimen of the common sloth. - { "./data/gpt_eval/test_gpt_eval_2_merge.bin", true }, // prompt: You cannot, my good sir, take that away from me without having me retaliate in the most ferocious way. - { "./data/gpt_eval/test_gpt_eval_3_merge.bin", true }, // prompt: Ceci est un texte en français pour tester le bon fonctionnement de bark. -}; - -static const int n_threads = 4; - -int main() { - const std::string fname = "../ggml_weights/ggml_weights_text.bin"; - - gpt_model model; - if (gpt_model_load(fname, model) > 0) { - fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str()); - return 1; - } - - bark_sequence tokens; - logit_sequence gt_logits, logits; - - auto & hparams = model.hparams; - int n_vocab = hparams.n_out_vocab; - logits.resize(n_vocab); - - // dry run to estimate mem_per_token - { - int n_past = 0; - bark_token decoy[4] = { 0, 1, 2, 3 }; - gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads); - } - - for (int i = 0; i < (int) test_args.size(); i++) { - tokens.clear(); - gt_logits.clear(); - - std::string path = std::get<0>(test_args[i]); - bool merge_ctx = std::get<1>(test_args[i]); - - load_test_data(path, tokens, gt_logits); - - int n_past = 0; - gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads); - - printf("\n"); - printf("%s: %s\n", __func__, path.c_str()); - if (!run_test(gt_logits, logits)) { - printf("%s: test %d failed.\n", __func__, i+1); - } else { - printf("%s: test %d passed.\n", __func__, i+1); - } - } - - return 0; -} \ No newline at end of file
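The deleted `tests/test-forward-*.cpp` and `tests/test-gpt-eval.cpp` programs above exercised the internal `gpt_eval` / `bark_forward_*` helpers directly against binary fixtures. The new `bark/tests/test-coarse-encoder.cpp` and `bark/tests/test-fine-encoder.cpp` added in this diff instead load the full model and run the forward passes through the `bark_context`, seeding `bctx->semantic_tokens` or `bctx->coarse_tokens` by hand. A sketch of how the two stages chain under that API, assuming the signatures shown in those new tests; the semantic-token values are placeholders and the boolean returns mirror the `if (!...)` checks in the tests.

```cpp
// Sketch: driving the coarse and fine encoders through a bark_context, the way
// the new bark/tests/test-coarse-encoder.cpp and test-fine-encoder.cpp do.
#include <cstdio>

#include "bark.h"

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <weights_dir>\n", argv[0]);
        return 1;
    }

    const int n_threads = 4;
    const bark_verbosity_level verbosity = bark_verbosity_level::MEDIUM;

    struct bark_context * bctx = bark_load_model(argv[1], verbosity);
    if (!bctx) {
        fprintf(stderr, "could not load model from '%s'\n", argv[1]);
        return 1;
    }

    // Stage inputs live on the context: seed the semantic tokens by hand
    // (placeholder values), then run coarse and fine in sequence.
    bctx->semantic_tokens = { 1913, 8020, 8572, 1722, 59, 28 };

    if (!bark_forward_coarse_encoder(bctx, n_threads, verbosity)) {
        fprintf(stderr, "coarse encoder failed\n");
        return 1;
    }
    // bctx->coarse_tokens now holds [seq_length][n_coarse_codebooks] codes,
    // which the fine encoder consumes directly.
    if (!bark_forward_fine_encoder(bctx, n_threads, verbosity)) {
        fprintf(stderr, "fine encoder failed\n");
        return 1;
    }

    fprintf(stderr, "fine tokens: [%zu, %zu]\n",
            bctx->fine_tokens.size(), bctx->fine_tokens[0].size());

    bark_free(bctx);
    return 0;
}
```

Whether a matching helper exposes the semantic (text) stage the same way is not shown in this diff, so that step is left to the hard-coded seed above.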
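On the encodec side, the in-tree decoder deleted above (now provided by the `encodec.cpp` submodule) decodes residual vector quantization: `encodec_quantizer_decode_eval` sums, over the `n_q` codebooks, the embedding row selected by each codebook's code at every time step. A self-contained sketch of that arithmetic on plain vectors follows, assuming nothing about the submodule's own API; codes are indexed `[seq_length][n_q]` here for readability, whereas the removed ggml code views one codebook's whole code sequence at a time.

```cpp
// Self-contained sketch of residual-vector-quantization decoding, mirroring the
// per-codebook lookup-and-accumulate loop of the deleted encodec_quantizer_decode_eval.
// Each codebook embedding table is [n_bins][hidden_dim].
#include <cstdint>
#include <vector>

std::vector<std::vector<float>> rvq_decode(
        const std::vector<std::vector<int32_t>> & codes,                 // [seq_length][n_q]
        const std::vector<std::vector<std::vector<float>>> & codebooks,  // [n_q][n_bins][hidden_dim]
        int hidden_dim) {
    const size_t seq_length = codes.size();
    const size_t n_q = codebooks.size();

    std::vector<std::vector<float>> out(seq_length, std::vector<float>(hidden_dim, 0.0f));

    for (size_t t = 0; t < seq_length; ++t) {
        for (size_t q = 0; q < n_q; ++q) {
            // Look up the embedding row chosen by this codebook's code ...
            const std::vector<float> & row = codebooks[q][codes[t][q]];
            for (int d = 0; d < hidden_dim; ++d) {
                out[t][d] += row[d];   // ... and accumulate: residual stages sum into one vector
            }
        }
    }
    return out;
}
```

The removed ggml implementation builds the same sum with `ggml_get_rows` per codebook and a running `ggml_add`, then transposes the result before handing it to the decoder convolutions.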