#include "encodec.h" #include #include #define CLS_TOKEN_ID 101 #define SEP_TOKEN_ID 102 #define TEXT_ENCODING_OFFSET 10048 #define TEXT_PAD_TOKEN 129595 #define CODEBOOK_SIZE 1024 #define N_COARSE_CODEBOOKS 2 #define N_FINE_CODEBOOKS 8 #define SEMANTIC_PAD_TOKEN 10000 #define SEMANTIC_INFER_TOKEN 129599 #define SEMANTIC_VOCAB_SIZE 10000 #define SEMANTIC_RATE_HZ 49.9 #define COARSE_RATE_HZ 75 #define COARSE_SEMANTIC_PAD_TOKEN 12048 #define COARSE_INFER_TOKEN 12050 struct gpt_hparams { int32_t n_in_vocab; int32_t n_out_vocab; int32_t n_layer; int32_t n_head; int32_t n_embd; int32_t block_size; int32_t n_lm_heads; int32_t n_wtes; int32_t n_codes_given = 1; }; struct bark_vocab { using id = int32_t; using token = std::string; std::map token_to_id; std::map id_to_token; std::map subword_token_to_id; std::map id_to_subword_token; }; struct gpt_layer { // normalization struct ggml_tensor * ln_1_g; struct ggml_tensor * ln_1_b; struct ggml_tensor * ln_2_g; struct ggml_tensor * ln_2_b; // attention struct ggml_tensor * c_attn_attn_w; struct ggml_tensor * c_attn_attn_b; struct ggml_tensor * c_attn_proj_w; struct ggml_tensor * c_attn_proj_b; // mlp struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_proj_w; struct ggml_tensor * c_mlp_proj_b; }; struct gpt_model { gpt_hparams hparams; // normalization struct ggml_tensor * ln_f_g; struct ggml_tensor * ln_f_b; struct ggml_tensor * wpe; // token embedding // struct ggml_tensor * wte; // position embedding // struct ggml_tensor * lm_head; // language model head std::vector wtes; std::vector lm_heads; std::vector layers; // key + value memory struct ggml_tensor * memory_k; struct ggml_tensor * memory_v; // struct ggml_context * ctx; std::map tensors; int32_t memsize = 0; }; struct bark_model { // encoder gpt_model coarse_model; gpt_model fine_model; gpt_model text_model; // decoder encodec_model codec_model; // vocab bark_vocab vocab; int32_t memsize = 0; }; bool gpt_model_load(const std::string& fname, gpt_model& model); bool bark_model_load(const std::string & dirname, bark_model & model); bool bark_vocab_load(const std::string& fname, bark_vocab& vocab, int32_t expected_size); void bert_tokenize( const bark_vocab& vocab, const char * text, int32_t * tokens, int32_t * n_tokens, int32_t n_max_tokens); bool bark_generate_audio( bark_model model, const bark_vocab& vocab, const char * text, const int n_threads);