FEAT Add forward pass of coarse model #5

Merged
merged 4 commits on Jul 16, 2023
Changes from 1 commit
CLN
PABannier committed Jul 16, 2023
commit f7f03c368076f7363b1b527f326e05fe4e1c831c
98 changes: 36 additions & 62 deletions bark.cpp
@@ -1,7 +1,7 @@
/*
Port of Suno's Bark to C/C++.

Author: Pierre-Antoine Bannier<[email protected]>
Author: Pierre-Antoine Bannier <[email protected]>

Note on tokenization
--------------------
@@ -506,8 +506,6 @@ bool gpt_eval(

struct ggml_tensor * embd;

struct ggml_tensor * toy;

if (!merge_ctx) {
// usually only one token is in the sequence (since the context is saved in
// memory_k and memory_v)
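// (merge_ctx is only true on the very first call; after that, the cached
// keys/values mean each call carries just the single token sampled at the
// previous step.)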
@@ -767,18 +765,6 @@ bool gpt_eval(
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
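// (ggml_build_forward_expand records every op that inpL depends on into gf;
// ggml_graph_compute then executes that graph using ctx0's memory pool.)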

if (toy) {
for (int i = 0; i < toy->ne[1]; i++) {
for (int j = 0; j < toy->ne[0]; j++) {
float v = *(float *) ((char *)toy->data + i*toy->nb[1] + j*toy->nb[0]);
printf("%.4f ", v);
}
printf("\n\n");
}

printf("dim: [%d, %d]\n", toy->ne[0], toy->ne[1]);
}

// return result just for the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
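// (inpL holds N rows of n_vocab logits, one per input position; the offset
// n_vocab*(N-1) selects the row for the final position, which is all that
// sampling the next token requires.)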
@@ -906,60 +892,48 @@ bool bark_generate_audio(
printf("\n\n");

// encode text (text model)
// std::vector<bark_vocab::id> inp_semantic;
// {
// int n_past = 0;
// float eos_p = 0;

// std::vector<bark_vocab::id> input = tokens;
// std::vector<float> logits;

// // dry run to estimate mem_per_token
// size_t mem_per_token = 0;
// gpt_eval(model.text_model, n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
std::vector<bark_vocab::id> inp_semantic;
{
int n_past = 0;
float eos_p = 0;

// for (int i = 0; i < 768; i++) {
// const bool merge_ctx = i == 0;
// gpt_eval(model.text_model, n_threads, n_past, merge_ctx, input, logits, mem_per_token);
std::vector<bark_vocab::id> input = tokens;
std::vector<float> logits;

// float logits_pad_token = logits[SEMANTIC_PAD_TOKEN];
// logits.resize(SEMANTIC_VOCAB_SIZE);
// dry run to estimate mem_per_token
size_t mem_per_token = 0;
gpt_eval(model.text_model, n_threads, 0, false, { 0, 1, 2, 3 }, logits, mem_per_token);
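// (The throwaway evaluation above runs a fixed 4-token prompt so gpt_eval can
// report its per-token memory use; subsequent real calls size their ggml
// scratch buffers from mem_per_token.)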

// if (early_stop)
// logits.push_back(logits[logits_pad_token]);
for (int i = 0; i < 768; i++) {
const bool merge_ctx = i == 0;
gpt_eval(model.text_model, n_threads, n_past, merge_ctx, input, logits, mem_per_token);

// if (i == 0)
// n_past += input.size() - 256; // first step, context are merged
// else
// n_past += input.size();
float logits_pad_token = logits[SEMANTIC_PAD_TOKEN];
logits.resize(SEMANTIC_VOCAB_SIZE);

// input.clear();
if (early_stop)
logits.push_back(logits_pad_token);

// bark_vocab::id sampled_id = gpt_sample(vocab, logits, temp, rng, &eos_p);
// input.push_back(sampled_id);
// inp_semantic.push_back(sampled_id);
if (i == 0)
n_past += input.size() - 256; // first step, contexts are merged
else
n_past += input.size();
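// (With merge_ctx, the two 256-token halves of the prompt prefix share
// positions, so the cache advances by input.size() - 256 on the first step
// and by the full input length afterwards.)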

// printf("%d ", sampled_id);
// fflush(stdout);
input.clear();

// if (early_stop && ((sampled_id == SEMANTIC_VOCAB_SIZE) || (eos_p > min_eos_p)))
// break;
// }
bark_vocab::id sampled_id = gpt_sample(vocab, logits, temp, rng, &eos_p);
input.push_back(sampled_id);
inp_semantic.push_back(sampled_id);

// printf("\n\ntext semantic sequence length: %d\n", inp_semantic.size());
printf("%d ", sampled_id);
fflush(stdout);

// }
if (early_stop && ((sampled_id == SEMANTIC_VOCAB_SIZE) || (eos_p > min_eos_p)))
break;
}
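// (When early_stop is set, the pad logit appended above sits at index
// SEMANTIC_VOCAB_SIZE; sampling that id, or eos_p rising above min_eos_p,
// ends semantic generation early.)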

std::vector<bark_vocab::id> inp_semantic = {
206, 3252, 206, 206, 7567, 206, 10, 3252, 3252, 3252, 206, 206, 10, 3174,
3981, 206, 2009, 147, 3961, 56, 56, 3961, 10, 296, 56, 56, 147, 296,
296, 147, 10, 302, 273, 8020, 8020, 1722, 59, 59, 9284, 7695, 4133, 6492,
92, 148, 234, 522, 1333, 3005, 41, 41, 33, 33, 140, 933, 6202, 6202,
6747, 8174, 2049, 7656, 9804, 9804, 3216, 17, 17, 113, 9414, 5419, 3831, 3831,
3663, 2224, 2224, 2224, 2224, 9144, 1667, 1667, 1667, 1667, 1191, 1667, 44, 1191,
326, 326, 33, 33, 33, 33, 10, 140, 335, 8103, 2064, 2064, 1538, 1538,
1538, 1538, 1538, 1538, 1538, 1538, 1538, 2311
};
printf("\n\nsemantic sequence length: %d\n\n", inp_semantic.size());
}

// coarse encoding (coarse model)
std::vector<bark_vocab::id> input_coarse;
@@ -993,18 +967,16 @@ bool bark_generate_audio(
for (int ix = original_size; ix < 256; ix++)
input_in[ix] = COARSE_SEMANTIC_PAD_TOKEN;

// concatenate input_in and input_coarse
input_in.push_back(COARSE_INFER_TOKEN);
// for (int ix = max_coarse_history; ix < input_coarse.size(); ix++)
// input_in.push_back(input_coarse[ix]);

// concatenate input_in and input_coarse
input_in.insert(
input_in.end(),
std::make_move_iterator(input_coarse.end() - std::min(max_coarse_history, (int) input_coarse.size())),
std::make_move_iterator(input_coarse.end())
);
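// (Each window's prompt is therefore: up to 256 semantic tokens padded with
// COARSE_SEMANTIC_PAD_TOKEN, the COARSE_INFER_TOKEN separator, then the last
// max_coarse_history coarse tokens generated so far.)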

int n_past = 0;

mem_per_token *= 1.1; // context length is growing, mem_per_token must grow as well

for(int j = 0; j < sliding_window_size; j++) {
@@ -1041,6 +1013,8 @@

// for n in range(1, N_COARSE_CODEBOOKS):
// gen_coarse_audio_arr[n, :] -= n * CODEBOOK_SIZE
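// A sketch of what that step could look like in C++, assuming the generated
// tokens are first reshaped into a row-major array gen_coarse_audio_arr of
// shape [N_COARSE_CODEBOOKS][n_steps] (names and layout hypothetical, as the
// original leaves this step unimplemented):
//
//     for (int n = 1; n < N_COARSE_CODEBOOKS; n++)
//         for (int k = 0; k < n_steps; k++)
//             gen_coarse_audio_arr[n][k] -= n * CODEBOOK_SIZE;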

printf("\n\ncoarse sequence length: %d\n\n", input_coarse.size());
}

}