sync: use encodec's latest version as a submodule #124

Merged: 69 commits on Feb 13, 2024
Changes from 1 commit

Commits (69)
2302881
rm encodec
PABannier Oct 26, 2023
450a606
add git submodules
PABannier Oct 26, 2023
4ece5fa
removed bark util
PABannier Oct 26, 2023
9c9f7e8
updated CMakeLists
PABannier Oct 26, 2023
e2b350a
rm build scripts
PABannier Oct 26, 2023
d491cc7
mv dr_wav in examples
PABannier Oct 26, 2023
38c2e49
common cpp
PABannier Oct 26, 2023
6b32b3b
moved def constants
PABannier Oct 26, 2023
ab9b528
text encoder loaded with the latest ggml API
PABannier Oct 27, 2023
6a9b50a
pulled ggml upstream
PABannier Oct 27, 2023
efbdd56
temporarily removed subdirectory encodec.cpp
PABannier Oct 27, 2023
753d5cf
clean forward pass text encoder
PABannier Oct 27, 2023
e4e712f
compiling
PABannier Oct 27, 2023
d8fc378
fix issue definition
PABannier Oct 27, 2023
b258c08
clean
PABannier Oct 28, 2023
6642e75
remove codec parsing functions
PABannier Oct 28, 2023
33d186e
kinda works
PABannier Oct 28, 2023
83a21ec
bias is stored in hparams
PABannier Oct 28, 2023
6cad888
working text encoder
PABannier Oct 28, 2023
242e7c5
cln tests
PABannier Oct 28, 2023
c1d0edd
coarse working?
PABannier Oct 28, 2023
94cd5e2
override bias
PABannier Oct 29, 2023
acf9dfa
working fine encoder
PABannier Oct 29, 2023
c1def75
rename quantize.cpp into main.cpp
PABannier Oct 29, 2023
cfaa59c
included quantize as a target
PABannier Oct 29, 2023
05ef89d
exposed quantization function
PABannier Oct 29, 2023
6172381
minor
PABannier Oct 29, 2023
6d0db93
update CIs
PABannier Oct 29, 2023
a978908
updated CIs
PABannier Oct 29, 2023
8ae7dc5
passing tokenizer test
PABannier Oct 29, 2023
7ad8cd5
Merge branch 'main' of https://github.com/PABannier/bark.cpp into enc…
PABannier Oct 29, 2023
7c2ae84
fast text encoder
PABannier Dec 11, 2023
d3971c2
Merge branch 'main' of https://github.com/PABannier/bark.cpp into enc…
PABannier Dec 11, 2023
5874a87
`bark.cpp` -> `bark`
PABannier Dec 11, 2023
5312577
server abides by latest API
PABannier Dec 12, 2023
e7b7d75
rm fast-text-encoder example
PABannier Dec 12, 2023
2aaf7b2
pass `-O3` release flag
PABannier Dec 12, 2023
79ed551
rm fast_text_encoder from CMakeLists
PABannier Dec 12, 2023
4f72d56
restructured
PABannier Dec 12, 2023
f13498a
CMakeLists arranged
PABannier Dec 13, 2023
f517570
update CIs
PABannier Dec 13, 2023
b8bdd76
add encodec.cpp in the loop
PABannier Dec 14, 2023
5319d26
add verbosity level
PABannier Dec 15, 2023
11c3f9a
Fix CIs (#128)
AlexHayton Dec 30, 2023
da3cc56
Merge branch 'encodec_as_submodule' of https://github.com/PABannier/b…
PABannier Jan 2, 2024
07a322c
fix coarse encoder internal pass
PABannier Jan 3, 2024
3002698
`VerbosityLevel` -> `bark_verbosity_level`
PABannier Jan 5, 2024
747345c
updated examples
PABannier Jan 5, 2024
a3e3e92
populated time per token
PABannier Jan 5, 2024
19e1683
remove whitespace
PABannier Jan 5, 2024
fa6975c
BarkProgressBar implemented
PABannier Jan 6, 2024
b9e2109
verbosity level controlled for cleaner output
PABannier Jan 6, 2024
4401975
removed params as macros and moved them into default constructor
PABannier Jan 6, 2024
38846ec
updated README
PABannier Jan 6, 2024
59d5352
removed useless `n_predict` in params
PABannier Jan 6, 2024
07e92de
removed old tests
PABannier Jan 6, 2024
ec677fb
fix wrong return type, quantization works again
PABannier Jan 6, 2024
035ef16
Added Metal and CUDA backend
PABannier Jan 6, 2024
6e4ac9a
updated docs
PABannier Jan 6, 2024
ac327a9
cosmit
PABannier Jan 6, 2024
d347134
rm submodule
PABannier Jan 7, 2024
d7e9661
added encodec submodule
PABannier Jan 7, 2024
1fbe29d
remove mem_per_token
PABannier Jan 7, 2024
b3d9179
more verbose errors
PABannier Jan 7, 2024
94fea82
clean
PABannier Jan 7, 2024
bec8547
reset allocr to reduce memory footprint
PABannier Jan 7, 2024
df7c22a
add tests
PABannier Jan 7, 2024
6fbc184
expose forward passes
PABannier Jan 7, 2024
87a102b
enhanced README.md
PABannier Feb 12, 2024

clean forward pass text encoder
PABannier committed Oct 27, 2023
commit 753d5cf26516bf144eb196445b76e63828f690b3
130 changes: 2 additions & 128 deletions bark.cpp
@@ -42,131 +42,6 @@

static const size_t MB = 1024*1024;

typedef std::vector<int32_t> bark_sequence;
typedef std::vector<std::vector<int32_t>> bark_codes;

struct gpt_hparams {
int32_t n_in_vocab;
int32_t n_out_vocab;
int32_t n_layer;
int32_t n_head;
int32_t n_embd;
int32_t block_size;
int32_t n_lm_heads;
int32_t n_wtes;
int32_t ftype;

int32_t n_codes_given = 1;
};

struct bark_vocab {
using id = int32_t;
using token = std::string;

std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};

struct gpt_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;

struct ggml_tensor * ln_2_g;
struct ggml_tensor * ln_2_b;

// attention
struct ggml_tensor * c_attn_attn_w;
struct ggml_tensor * c_attn_attn_b;

struct ggml_tensor * c_attn_proj_w;
struct ggml_tensor * c_attn_proj_b;

// mlp
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;

struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};

struct gpt_model {
gpt_hparams hparams;

// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;

struct ggml_tensor * wpe; // position embedding
std::vector<struct ggml_tensor *> wtes; // token embedding
std::vector<struct ggml_tensor *> lm_heads; // language model head

std::vector<gpt_layer> layers;

// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;

struct ggml_context * ctx;

ggml_backend_t backend = NULL;

ggml_backend_buffer_t buffer_w;
ggml_backend_buffer_t buffer_kv;

std::map<std::string, struct ggml_tensor *> tensors;

//
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
int64_t t_main_us = 0;

//
int64_t n_sample = 0;
int64_t n_predict = 0;

//
int64_t memsize = 0;
size_t mem_per_token = 0;
};

struct bark_model {
// encoder
gpt_model coarse_model;
gpt_model fine_model;
gpt_model text_model;

// vocab
bark_vocab vocab;
};

struct bark_context {
bark_model model;

// buffer for model evaluation
ggml_backend_buffer_t buf_compute;

// custom allocator
struct ggml_allocr * allocr = NULL;

std::mt19937 rng;

bark_sequence tokens;
bark_sequence semantic_tokens;

bark_codes coarse_tokens;
bark_codes fine_tokens;

std::vector<float> audio_arr;

// hyperparameters
bark_context_params params;

// statistics
int64_t t_load_us = 0;
int64_t t_start_us = 0;

};

struct bark_progress {
float current = 0.0f;
@@ -1330,9 +1205,7 @@ struct bark_context_params bark_context_default_params() {
return result;
}

struct bark_context * bark_load_model(
const std::string & model_path,
const bark_context_params & params) {
struct bark_context * bark_load_model(const std::string & model_path) {
int64_t t_load_start_us = ggml_time_us();

struct bark_context * bctx = new bark_context();
@@ -1343,6 +1216,7 @@ struct bark_context * bark_load_model(
return {};
}

bark_context_params params = bark_context_default_params();
bctx->rng = std::mt19937(params.seed);

bctx->params = params;
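
For reference, after this change the loader takes only the model path; bark_context_params is filled in internally through bark_context_default_params(). A minimal caller sketch (the weights path and the remaining generation and cleanup steps are placeholders, not taken from this diff):

```cpp
#include "bark.h"

#include <cstdio>

int main() {
    // The loader now builds its own default parameters internally,
    // so the caller only supplies the path to the model weights.
    struct bark_context * bctx = bark_load_model("./ggml_weights.bin");
    if (!bctx) {
        fprintf(stderr, "failed to load the bark model\n");
        return 1;
    }

    // ... run text-to-audio generation with bctx, then free the context ...

    return 0;
}
```
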
139 changes: 127 additions & 12 deletions bark.h
@@ -1,4 +1,5 @@
#include "ggml.h"
#include "ggml-backend.h"

#include <map>
#include <random>
@@ -19,11 +20,133 @@
# define BARK_API
#endif


typedef int32_t bark_token;

struct bark_context;
struct bark_progress;
typedef std::vector<int32_t> bark_sequence;
typedef std::vector<std::vector<int32_t>> bark_codes;

struct gpt_hparams {
int32_t n_in_vocab;
int32_t n_out_vocab;
int32_t n_layer;
int32_t n_head;
int32_t n_embd;
int32_t block_size;
int32_t n_lm_heads;
int32_t n_wtes;
int32_t ftype;

int32_t n_codes_given = 1;
};

struct bark_vocab {
using id = int32_t;
using token = std::string;

std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};

struct gpt_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;

struct ggml_tensor * ln_2_g;
struct ggml_tensor * ln_2_b;

// attention
struct ggml_tensor * c_attn_attn_w;
struct ggml_tensor * c_attn_attn_b;

struct ggml_tensor * c_attn_proj_w;
struct ggml_tensor * c_attn_proj_b;

// mlp
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;

struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};

struct gpt_model {
gpt_hparams hparams;

// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;

struct ggml_tensor * wpe; // position embedding
std::vector<struct ggml_tensor *> wtes; // token embedding
std::vector<struct ggml_tensor *> lm_heads; // language model head

std::vector<gpt_layer> layers;

// key + value memory
struct ggml_tensor * memory_k;
struct ggml_tensor * memory_v;

struct ggml_context * ctx;

ggml_backend_t backend = NULL;

ggml_backend_buffer_t buffer_w;
ggml_backend_buffer_t buffer_kv;

std::map<std::string, struct ggml_tensor *> tensors;

//
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
int64_t t_main_us = 0;

//
int64_t n_sample = 0;
int64_t n_predict = 0;

//
int64_t memsize = 0;
size_t mem_per_token = 0;
};

struct bark_model {
// encoder
gpt_model coarse_model;
gpt_model fine_model;
gpt_model text_model;

// vocab
bark_vocab vocab;
};

struct bark_context {
bark_model model;

// buffer for model evaluation
ggml_backend_buffer_t buf_compute;

// custom allocator
struct ggml_allocr * allocr = NULL;

std::mt19937 rng;

bark_sequence tokens;
bark_sequence semantic_tokens;

bark_codes coarse_tokens;
bark_codes fine_tokens;

std::vector<float> audio_arr;

// hyperparameters
bark_context_params params;

// statistics
int64_t t_load_us = 0;
int64_t t_eval_us = 0;

};

struct bark_context_params {
uint32_t seed; // RNG seed
@@ -41,13 +164,6 @@ struct bark_context_params {
int max_coarse_history;
};

struct bark_model;
struct bark_vocab;

struct gpt_hparams;
struct gpt_layer;
struct gpt_model;

/**
* @brief Returns the default parameters for a bark context.
*
@@ -63,8 +179,7 @@ BARK_API struct bark_context_params bark_context_default_params(void);
* @return A pointer to the loaded bark model context.
*/
BARK_API struct bark_context * bark_load_model(
const std::string & model_path,
const bark_context_params & params);
const std::string & model_path);

/**
* Generates an audio file from the given text using the specified Bark context.
45 changes: 45 additions & 0 deletions examples/common.cpp
@@ -4,6 +4,8 @@
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"

#include "common.h"

#define SAMPLE_RATE 24000

void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
@@ -21,3 +23,46 @@ void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {

fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
}

void bark_print_usage(char ** argv, const bark_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -s N, --seed N seed for random number generator (default: %d)\n", params.seed);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model_path.c_str());
fprintf(stderr, " -o FNAME, --outwav FNAME\n");
fprintf(stderr, " output generated wav (default: %s)\n", params.dest_wav_path.c_str());
fprintf(stderr, "\n");
}

int bark_params_parse(int argc, char ** argv, bark_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];

if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = argv[++i];
} else if (arg == "-m" || arg == "--model") {
params.model_path = argv[++i];
} else if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-o" || arg == "--outwav") {
params.dest_wav_path = argv[++i];
} else if (arg == "-h" || arg == "--help") {
bark_print_usage(argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
bark_print_usage(argv, params);
exit(0);
}
}

return 0;
}
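
Taken together, these helpers give the examples a shared argument parser and WAV writer. A usage sketch (the bark_params struct and the helper declarations are assumed to live in examples/common.h with the fields referenced above, and the audio buffer is a stand-in for real generation output):

```cpp
#include "common.h"

#include <vector>

int main(int argc, char ** argv) {
    bark_params params;

    // Populate n_threads, seed, prompt, model_path and dest_wav_path from the
    // command line; prints usage and exits on -h/--help or an unknown flag.
    if (bark_params_parse(argc, argv, params) != 0) {
        return 1;
    }

    // Placeholder buffer; in the real examples this comes from bark generation.
    std::vector<float> audio_arr;
    write_wav_on_disk(audio_arr, params.dest_wav_path);

    return 0;
}
```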