Skip to content

Commit

Permalink
ENH Introduce bark context (PABannier#101)
Browse files Browse the repository at this point in the history
  • Loading branch information
PABannier committed Sep 1, 2023
1 parent 441490b commit e33b060
Show file tree
Hide file tree
Showing 9 changed files with 280 additions and 237 deletions.
322 changes: 160 additions & 162 deletions bark.cpp

Large diffs are not rendered by default.

116 changes: 69 additions & 47 deletions bark.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#define COARSE_SEMANTIC_PAD_TOKEN 12048
#define COARSE_INFER_TOKEN 12050


struct bark_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());

Expand Down Expand Up @@ -115,7 +116,14 @@ struct gpt_model {
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;

//
int64_t t_sample_us = 0;
int64_t t_predict_us = 0;
int64_t t_main_us = 0;

//
int64_t memsize = 0;
size_t mem_per_token = 0;
};


Expand All @@ -134,32 +142,54 @@ struct bark_model {
int64_t memsize = 0;
};

// Inference context: bundles the model, RNG state, timing counters, and the
// intermediate token buffers produced by each stage of the Bark pipeline
// (text -> semantic -> coarse -> fine -> audio).
struct bark_context {
    explicit bark_context(bark_model & model) : model(model) {}
    ~bark_context() {
        // When the context owns the model (model_owner == true), it is
        // responsible for freeing it; `model` is a reference to a
        // heap-allocated bark_model in that case.
        if (model_owner) {
            delete &model;
        }
    }

    // RNG used for token sampling; seed it before generation for reproducibility.
    std::mt19937 rng;

    bark_model & model;

    // True when this context allocated (and must delete) the model.
    bool model_owner = false;

    // Timing counters (microseconds). Zero-initialized so they are never
    // read as indeterminate values before being set.
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

    // Input text tokens (output of the tokenizer).
    bark_sequence tokens;

    // Output of the semantic (text) encoder.
    bark_sequence semantic_tokens;

    // Output of the coarse acoustic encoder.
    bark_codes coarse_tokens;

    // Output of the fine acoustic encoder.
    bark_codes fine_tokens;

    // Final decoded audio samples (output of the EnCodec decoder).
    std::vector<float> audio_arr;
};

void bark_free(bark_context * ctx);

bool gpt_model_load(const std::string& fname, gpt_model& model);

bool gpt_eval(
const gpt_model & model,
gpt_model & model,
bark_vocab::id * tokens,
int n_tokens,
float * logits,
int * n_past,
bool merge_ctx,
int n_threads,
size_t & mem_per_token);
int n_threads);

bool fine_gpt_eval(
const gpt_model & model,
gpt_model & model,
bark_vocab::id * tokens,
int n_tokens,
float * logits,
int n_threads,
int codebook_ix,
size_t & mem_per_token);

bark_vocab::id gpt_sample(
std::vector<float> & logits,
std::mt19937 & rng,
float temp,
float * eos_p);
int codebook_ix);

bool bark_model_load(const std::string & dirname, bark_model & model);

Expand All @@ -173,40 +203,30 @@ void bert_tokenize(
int32_t n_max_tokens);

bool bark_generate_audio(
bark_model model,
const bark_vocab& vocab,
const char * text,
const int n_threads,
const int32_t seed,
const std::string& dest_wav_path);

bark_sequence bark_forward_text_encoder(
const bark_sequence & tokens,
const gpt_model model,
std::mt19937 & rng,
const int n_threads,
const float temp,
const float min_eos_p);

bark_codes bark_forward_coarse_encoder(
const bark_sequence & tokens,
const gpt_model model,
std::mt19937 & rng,
const int n_threads,
const float temp,
const int max_coarse_history,
const int sliding_window_size);

bark_codes bark_forward_fine_encoder(
const bark_codes & tokens,
const gpt_model model,
std::mt19937 & rng,
const int n_threads,
const float temp);

audio_arr_t bark_forward_encodec(
const bark_codes & tokens,
const encodec_model model);
struct bark_context * ctx,
const char * text,
std::string & dest_wav_path,
int n_threads);

void bark_forward_text_encoder(
struct bark_context * ctx,
float temp,
float min_eos_p,
int n_threads);

void bark_forward_coarse_encoder(
struct bark_context * ctx,
int max_coarse_history,
int sliding_window_size,
float temp,
int n_threads);

void bark_forward_fine_encoder(
struct bark_context * ctx,
float temp,
int n_threads);

void bark_forward_encodec(struct bark_context * ctx);

struct bark_progress {
float current = 0.0f;
Expand Down Expand Up @@ -240,4 +260,6 @@ void read_tensor_from_file(std::ifstream & fin, struct ggml_tensor * t);

bool allequal(struct ggml_tensor * a, struct ggml_tensor * b, std::string test_name);

bool allclose(struct ggml_tensor * a, struct ggml_tensor * b, float tol, std::string test_name);
bool allclose(struct ggml_tensor * a, struct ggml_tensor * b, float tol, std::string test_name);

struct bark_context * bark_new_context_with_model(struct bark_model * model);
4 changes: 4 additions & 0 deletions encodec.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,11 @@ struct encodec_model {

std::map<std::string, struct ggml_tensor *> tensors;

int64_t t_predict_us = 0;
int64_t t_main_us = 0;

int64_t memsize = 0;
size_t mem_per_token = 0;
};


Expand Down
15 changes: 9 additions & 6 deletions examples/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ int main(int argc, char **argv) {
t_load_us = ggml_time_us() - t_start_us;
}

// create a context
bark_context * ctx = bark_new_context_with_model(&model);
if (ctx == nullptr) {
fprintf(stderr, "%s: failed to create context\n", __func__);
return 1;
}

printf("\n");

std::string prompt = "this is an audio";
Expand All @@ -42,7 +49,7 @@ int main(int argc, char **argv) {
}

const int64_t t_eval_us_start = ggml_time_us();
bark_generate_audio(model, model.vocab, prompt.data(), params.n_threads, params.seed, params.dest_wav_path);
bark_generate_audio(ctx, prompt.data(), params.dest_wav_path, params.n_threads);
t_eval_us = ggml_time_us() - t_eval_us_start;

// report timing
Expand All @@ -55,11 +62,7 @@ int main(int argc, char **argv) {
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

// TODO: write wrapper
ggml_free(model.coarse_model.ctx);
ggml_free(model.fine_model.ctx);
ggml_free(model.text_model.ctx);
ggml_free(model.codec_model.ctx);
bark_free(ctx);

return 0;
}
5 changes: 2 additions & 3 deletions tests/test-fine-gpt-eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ int main() {
std::vector<float> gt_logits, logits;

// dry run to estimate mem_per_token
size_t mem_per_token = 0;
bark_sequence decoy = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
fine_gpt_eval(model, decoy.data(), decoy.size(), nullptr, n_threads, 2, mem_per_token);
fine_gpt_eval(model, decoy.data(), decoy.size(), nullptr, n_threads, 2);

for (int i = 0; i < (int) test_args.size(); i++) {
std::string path = std::get<0>(test_args[i]);
Expand All @@ -56,7 +55,7 @@ int main() {
std::vector<int> tokens_vec = flatten(tokens);

logits.resize(1024*1056);
fine_gpt_eval(model, tokens_vec.data(), tokens_vec.size(), logits.data(), n_threads, codebook_ix, mem_per_token);
fine_gpt_eval(model, tokens_vec.data(), tokens_vec.size(), logits.data(), n_threads, codebook_ix);

printf("\n");
printf("%s: %s\n", __func__, path.c_str());
Expand Down
16 changes: 11 additions & 5 deletions tests/test-forward-coarse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,16 @@ int main() {

std::mt19937 rng(0);

gpt_model model;
if(!gpt_model_load(fname, model)) {
bark_model model;

if(!gpt_model_load(fname, model.coarse_model)) {
fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str());
return 1;
}

bark_context * ctx = bark_new_context_with_model(&model);
ctx->rng = rng;

bark_sequence input;
bark_codes gt_tokens;

Expand All @@ -37,18 +41,20 @@ int main() {

std::string path = test_data[i];
load_test_data(path, input, gt_tokens);
ctx->semantic_tokens = input;

bark_codes tokens = bark_forward_coarse_encoder(
input, model, rng, n_threads, temp, max_coarse_history, sliding_window_size);
bark_forward_coarse_encoder(ctx, max_coarse_history, sliding_window_size, temp, n_threads);

printf("\n");
printf("%s: %s\n", __func__, path.c_str());
if (!run_test(transpose(gt_tokens), tokens)) {
if (!run_test(transpose(gt_tokens), ctx->coarse_tokens)) {
printf("%s: test %d failed.\n", __func__, i+1);
} else {
printf("%s: test %d passed.\n", __func__, i+1);
}
}

bark_free(ctx);

return 0;
}
18 changes: 12 additions & 6 deletions tests/test-forward-fine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,29 @@ int main() {

std::mt19937 rng(0);

gpt_model model;
if(!gpt_model_load(fname, model)) {
bark_model model;

if(!gpt_model_load(fname, model.fine_model)) {
fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str());
return 1;
}

bark_codes input, gt_tokens, tokens;
bark_context * ctx = bark_new_context_with_model(&model);
ctx->rng = rng;

bark_codes input, gt_tokens;

for (int i = 0; i < (int) test_data.size(); i++) {
input.clear();
gt_tokens.clear();
tokens.clear();

std::string path = test_data[i];
load_test_data(path, input, gt_tokens);

// TODO: need to remove transpose
bark_codes input_t = transpose(input);
bark_codes tokens = transpose(bark_forward_fine_encoder(input_t, model, rng, n_threads, temp));
ctx->coarse_tokens = transpose(input);
bark_forward_fine_encoder(ctx, temp, n_threads);
bark_codes tokens = transpose(ctx->fine_tokens);

printf("\n");
printf("%s: %s\n", __func__, path.c_str());
Expand All @@ -49,5 +53,7 @@ int main() {
}
}

bark_free(ctx);

return 0;
}
16 changes: 11 additions & 5 deletions tests/test-forward-semantic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,16 @@ int main() {

std::mt19937 rng(0);

gpt_model model;
if(!gpt_model_load(fname, model)) {
bark_model model;

if(!gpt_model_load(fname, model.text_model)) {
fprintf(stderr, "%s: invalid model file '%s'\n", __func__, fname.c_str());
return 1;
}

bark_context * ctx = bark_new_context_with_model(&model);
ctx->rng = rng;

bark_sequence input;
bark_sequence gt_tokens;

Expand All @@ -36,18 +40,20 @@ int main() {

std::string path = test_data[i];
load_test_data(path, input, gt_tokens);
ctx->tokens = input;

bark_sequence tokens = bark_forward_text_encoder(
input, model, rng, n_threads, temp, min_eos_p);
bark_forward_text_encoder(ctx, temp, min_eos_p, n_threads);

printf("\n");
printf("%s: %s\n", __func__, path.c_str());
if (!run_test(gt_tokens, tokens)) {
if (!run_test(gt_tokens, ctx->semantic_tokens)) {
printf("%s: test %d failed.\n", __func__, i+1);
} else {
printf("%s: test %d passed.\n", __func__, i+1);
}
}

bark_free(ctx);

return 0;
}
5 changes: 2 additions & 3 deletions tests/test-gpt-eval.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,10 @@ int main() {
logits.resize(n_vocab);

// dry run to estimate mem_per_token
size_t mem_per_token = 0;
{
int n_past = 0;
bark_vocab::id decoy[4] = { 0, 1, 2, 3 };
gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads, mem_per_token);
gpt_eval(model, decoy, 4, nullptr, &n_past, false, n_threads);
}

for (int i = 0; i < (int) test_args.size(); i++) {
Expand All @@ -51,7 +50,7 @@ int main() {
load_test_data(path, tokens, gt_logits);

int n_past = 0;
gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads, mem_per_token);
gpt_eval(model, tokens.data(), tokens.size(), logits.data(), &n_past, merge_ctx, n_threads);

printf("\n");
printf("%s: %s\n", __func__, path.c_str());
Expand Down

0 comments on commit e33b060

Please sign in to comment.