FEAT Add forward pass of coarse model #5

Merged
merged 4 commits on Jul 16, 2023
Changes from 1 commit
CLN
PABannier committed Jul 16, 2023
commit f7f03c368076f7363b1b527f326e05fe4e1c831c
98 changes: 36 additions & 62 deletions bark.cpp
@@ -1,7 +1,7 @@
/*
Port of Suno's Bark to C/C++.

Author: Pierre-Antoine Bannier<[email protected]>
Author: Pierre-Antoine Bannier <[email protected]>

Note on tokenization
--------------------
@@ -506,8 +506,6 @@ bool gpt_eval(

struct ggml_tensor * embd;

struct ggml_tensor * toy;

if (!merge_ctx) {
// usually only one token is in the sequence (since the context is saved in
// memory_k and memory_v)
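// (merge_ctx is only true on the very first call; after that, the cached
// keys/values mean each call carries just the single token sampled at the
// previous step.)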
@@ -767,18 +765,6 @@ bool gpt_eval(
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
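// (ggml_build_forward_expand records every op that inpL depends on into gf;
// ggml_graph_compute then executes that graph using ctx0's memory pool.)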

if (toy) {
for (int i = 0; i < toy->ne[1]; i++) {
for (int j = 0; j < toy->ne[0]; j++) {
float v = *(float *) ((char *)toy->data + i*toy->nb[1] + j*toy->nb[0]);
printf("%.4f ", v);
}
printf("\n\n");
}

printf("dim: [%d, %d]\n", toy->ne[0], toy->ne[1]);
}

// return result just for the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
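// (inpL holds N rows of n_vocab logits, one per input position; the offset
// n_vocab*(N-1) selects the row for the final position, which is all that
// sampling the next token requires.)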
@@ -906,60 +892,48 @@ bool bark_generate_audio(
printf("\n\n");

// encode text (text model)
// std::vector<bark_vocab::id> inp_semantic;
// {
// int n_past = 0;
// float eos_p = 0;

// std::vector<bark_vocab::id> input = tokens;
// std::vector<float> logits;

// // dry run to estimate mem_per_token
// size_t mem_per_token = 0;
// gpt_eval(model.text_model, n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
std::vector<bark_vocab::id> inp_semantic;
{
int n_past = 0;
float eos_p = 0;

// for (int i = 0; i < 768; i++) {
// const bool merge_ctx = i == 0;
// gpt_eval(model.text_model, n_threads, n_past, merge_ctx, input, logits, mem_per_token);
std::vector<bark_vocab::id> input = tokens;
std::vector<float> logits;

// float logits_pad_token = logits[SEMANTIC_PAD_TOKEN];
// logits.resize(SEMANTIC_VOCAB_SIZE);
// dry run to estimate mem_per_token
size_t mem_per_token = 0;
gpt_eval(model.text_model, n_threads, 0, false, { 0, 1, 2, 3 }, logits, mem_per_token);
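// (The throwaway evaluation above runs a fixed 4-token prompt so gpt_eval can
// report its per-token memory use; subsequent real calls size their ggml
// scratch buffers from mem_per_token.)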

// if (early_stop)
// logits.push_back(logits[logits_pad_token]);
for (int i = 0; i < 768; i++) {
const bool merge_ctx = i == 0;
gpt_eval(model.text_model, n_threads, n_past, merge_ctx, input, logits, mem_per_token);

// if (i == 0)
// n_past += input.size() - 256; // first step, context are merged
// else
// n_past += input.size();
float logits_pad_token = logits[SEMANTIC_PAD_TOKEN];
logits.resize(SEMANTIC_VOCAB_SIZE);

// input.clear();
if (early_stop)
logits.push_back(logits_pad_token);

// bark_vocab::id sampled_id = gpt_sample(vocab, logits, temp, rng, &eos_p);
// input.push_back(sampled_id);
// inp_semantic.push_back(sampled_id);
if (i == 0)
n_past += input.size() - 256; // first step, contexts are merged
else
n_past += input.size();
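// (With merge_ctx, the two 256-token halves of the prompt prefix share
// positions, so the cache advances by input.size() - 256 on the first step
// and by the full input length afterwards.)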

// printf("%d ", sampled_id);
// fflush(stdout);
input.clear();

// if (early_stop && ((sampled_id == SEMANTIC_VOCAB_SIZE) || (eos_p > min_eos_p)))
// break;
// }
bark_vocab::id sampled_id = gpt_sample(vocab, logits, temp, rng, &eos_p);
input.push_back(sampled_id);
inp_semantic.push_back(sampled_id);

// printf("\n\ntext semantic sequence length: %d\n", inp_semantic.size());
printf("%d ", sampled_id);
fflush(stdout);

// }
if (early_stop && ((sampled_id == SEMANTIC_VOCAB_SIZE) || (eos_p > min_eos_p)))
break;
}
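// (When early_stop is set, the pad logit appended above sits at index
// SEMANTIC_VOCAB_SIZE; sampling that id, or eos_p rising above min_eos_p,
// ends semantic generation early.)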

std::vector<bark_vocab::id> inp_semantic = {
206, 3252, 206, 206, 7567, 206, 10, 3252, 3252, 3252, 206, 206, 10, 3174,
3981, 206, 2009, 147, 3961, 56, 56, 3961, 10, 296, 56, 56, 147, 296,
296, 147, 10, 302, 273, 8020, 8020, 1722, 59, 59, 9284, 7695, 4133, 6492,
92, 148, 234, 522, 1333, 3005, 41, 41, 33, 33, 140, 933, 6202, 6202,
6747, 8174, 2049, 7656, 9804, 9804, 3216, 17, 17, 113, 9414, 5419, 3831, 3831,
3663, 2224, 2224, 2224, 2224, 9144, 1667, 1667, 1667, 1667, 1191, 1667, 44, 1191,
326, 326, 33, 33, 33, 33, 10, 140, 335, 8103, 2064, 2064, 1538, 1538,
1538, 1538, 1538, 1538, 1538, 1538, 1538, 2311
};
printf("\n\nsemantic sequence length: %d\n\n", inp_semantic.size());
}

// coarse encoding (coarse model)
std::vector<bark_vocab::id> input_coarse;
@@ -993,18 +967,16 @@ bool bark_generate_audio(
for (int ix = original_size; ix < 256; ix++)
input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN;

// concatenate input_in and input_coarse
input_in.push_back(COARSE_INFER_TOKEN);
// for (int ix = max_coarse_history; ix < input_coarse.size(); ix++)
// input_in.push_back(input_coarse[ix]);

// concatenate input_in and input_coarse
input_in.insert(
input_in.end(),
std::make_move_iterator(input_coarse.end() - std::min(max_coarse_history, (int) input_coarse.size())),
std::make_move_iterator(input_coarse.end())
);
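// (Each window's prompt is therefore: up to 256 semantic tokens padded with
// COARSE_SEMANTIC_PAD_TOKEN, the COARSE_INFER_TOKEN separator, then the last
// max_coarse_history coarse tokens generated so far.)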

int n_past = 0;

mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well

for(int j = 0; j < sliding_window_size; j++) {
@@ -1041,6 +1013,8 @@

// for n in range(1, N_COARSE_CODEBOOKS):
// gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
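// A sketch of what that step could look like in C++, assuming the generated
// tokens are first reshaped into a row-major array gen_coarse_audio_arr of
// shape [N_COARSE_CODEBOOKS][n_steps] (names and layout hypothetical, as the
// original leaves this step unimplemented):
//
//     for (int n = 1; n < N_COARSE_CODEBOOKS; n++)
//         for (int k = 0; k < n_steps; k++)
//             gen_coarse_audio_arr[n][k] -= n * CODEBOOK_SIZE;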

printf("\n\ncoarse sequence length: %d\n\n", input_coarse.size());
}

}