Add StarCoder/SantaCoder example #146

Merged
merged 14 commits on May 13, 2023
fix main
NouamaneTazi committed May 12, 2023
commit fba0a14d777d31aee994d4f693ee1ab46757df84
13 changes: 8 additions & 5 deletions examples/starcoder/main.cpp
@@ -156,6 +156,10 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;

+ const int head_dim = n_embd / hparams.n_head;
+ const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
+ const int kv_dim = kv_heads * head_dim;
+
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
  ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

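The three constants added above generalize the GPT-2 sizing code to multi-query attention (MQA), where all query heads share a single key/value head, so kv_heads becomes 1 and kv_dim shrinks from n_embd to head_dim. The snippet below is only an illustrative, self-contained sketch of that arithmetic; the n_embd and n_head values are assumptions for a StarCoder-sized model, not values read from a model file.

#include <cstdio>

int main() {
    const int n_embd = 6144; // assumed hidden size of a StarCoder-sized model
    const int n_head = 48;   // assumed number of query heads

    const int head_dim = n_embd / n_head; // 128

    // Multi-head attention: every query head has its own key/value head.
    const int kv_heads_mha = n_head;
    const int kv_dim_mha   = kv_heads_mha * head_dim; // 6144, i.e. n_embd

    // Multi-query attention: one key/value head shared by all query heads.
    const int kv_heads_mqa = 1;
    const int kv_dim_mqa   = kv_heads_mqa * head_dim; // 128

    printf("MHA kv_dim = %d, MQA kv_dim = %d\n", kv_dim_mha, kv_dim_mqa);
    return 0;
}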
@@ -169,8 +173,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
  ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*((n_embd + 2*kv_dim)*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w // TODO:
+ ctx_size += n_layer*( (n_embd + 2*kv_dim)*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

  ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
  ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
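The hunk above replaces the GPT-2 assumption that the fused QKV projection has 3*n_embd rows with (n_embd + 2*kv_dim): the query projection still produces n_embd outputs, while the key and value projections each produce only kv_dim. A rough, hypothetical helper to make the size difference concrete, using the same assumed hparams as the previous snippet:

#include <cstdio>

// Per-layer element count of c_attn_attn_w under the (n_embd + 2*kv_dim) layout:
// n_embd rows for Q, kv_dim rows each for K and V, times n_embd columns.
static long qkv_weight_elems(int n_embd, int kv_dim) {
    return (long)(n_embd + 2*kv_dim) * n_embd;
}

int main() {
    const int n_embd   = 6144; // assumed hidden size
    const int head_dim = 128;  // n_embd / n_head for 48 assumed heads

    printf("MHA: %ld elements\n", qkv_weight_elems(n_embd, n_embd));   // same as 3*n_embd*n_embd
    printf("MQA: %ld elements\n", qkv_weight_elems(n_embd, head_dim)); // much smaller KV projection
    return 0;
}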
@@ -213,7 +217,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;

- // MQA
  const int head_dim = n_embd / hparams.n_head;
  const int kv_heads = hparams.n_head; // 1 if MQA else hparams.n_head
  const int kv_dim = kv_heads * head_dim;
@@ -508,7 +511,7 @@ bool gpt2_eval(
  ggml_reshape_3d(ctx0,
  ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
  n_embd/n_head, n_head, n_past + N),
- 0, 2, 1, 3);
+ 0, 2, 1, 3); //TODO: need to be tiled

  // GG: flash attention
  //struct ggml_tensor * V =
@@ -524,7 +527,7 @@

  // K * Q
  // [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); //TODO: check if it broadcasts

  // KQ_scaled = KQ / sqrt(n_embd/n_head)
  // [n_past + N, N, 12]
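The two TODOs added in gpt2_eval ("need to be tiled" and "check if it broadcasts") appear to flag the same open question for MQA: the cache holds a single key/value head, while Q has n_head heads, so the K*Q product needs either a kernel that broadcasts K across heads or an explicit copy of K per head. The sketch below is only a conceptual illustration of that per-head repetition with plain loops and hypothetical names; it does not use or describe ggml's actual tensor layout.

#include <vector>

// k_single: one head's keys, laid out as [n_tokens][head_dim].
// Returns n_head copies, laid out as [n_head][n_tokens][head_dim].
std::vector<float> tile_kv_head(const std::vector<float> & k_single,
                                int n_tokens, int head_dim, int n_head) {
    std::vector<float> k_tiled((size_t)n_head * n_tokens * head_dim);
    for (int h = 0; h < n_head; ++h) {
        for (int t = 0; t < n_tokens; ++t) {
            for (int d = 0; d < head_dim; ++d) {
                // every query head sees the same key vector
                k_tiled[((size_t)h*n_tokens + t)*head_dim + d] =
                    k_single[(size_t)t*head_dim + d];
            }
        }
    }
    return k_tiled;
}

int main() {
    // 2 tokens with head_dim 4, tiled to 3 query heads
    std::vector<float> k = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> tiled = tile_kv_head(k, /*n_tokens=*/2, /*head_dim=*/4, /*n_head=*/3);
    return tiled.size() == 24u ? 0 : 1;
}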