Add support for DeepseekV2ForCausalLM #7519

Merged
merged 30 commits on May 28, 2024
Changes from 1 commit
Commits (30)
c8c353f
Added initial support for DeepseekV2ForCausalLM.
sszymczy May 16, 2024
b24c9ed
Merge branch 'ggerganov:master' into deepseek-v2
fairydreaming May 17, 2024
0398964
Removed unnecessary tensor operations.
sszymczy May 18, 2024
b50c07c
Added five new DeepSeek-V2-specific parameters:
sszymczy May 18, 2024
79f8417
Added initial support for DeepSeek-V2-Lite model.
sszymczy May 18, 2024
6050941
Corrected mscale calculation.
sszymczy May 18, 2024
7e4786b
Added expert_weights_scale parameter for scaling MoE gate weights.
sszymczy May 19, 2024
71a7422
Temporarily hard-coded mscale value for DeepSeek-V2 (FIXME!).
sszymczy May 19, 2024
f99df46
Replaced hardcoded mscale value with rescaling attn_factor that resul…
sszymczy May 19, 2024
3ae7235
Whitespace formatting fixes.
sszymczy May 19, 2024
68a5103
Referenced the relevant GitHub discussion instead of providing long c…
sszymczy May 20, 2024
7be56da
Added YaRN log multiplier model header parameter corresponding to the…
sszymczy May 20, 2024
842ff3f
Added 16B and 236B model types for DeepSeek-V2.
sszymczy May 21, 2024
c033958
Removed usage of output bias tensor since it's not present in DeepSee…
sszymczy May 21, 2024
a54685b
Merge remote-tracking branch 'upstream/master' into deepseek-v2
sszymczy May 24, 2024
bb9c361
gguf-py : re-add SCALING_YARN_LOG_MUL removed during merge by accident
sszymczy May 24, 2024
f3b5e7d
llama : correct llm_build_moe_ffn() arguments in build_arctic()
sszymczy May 26, 2024
abef8b2
llama : code style corrections
sszymczy May 27, 2024
a654cd9
llama : rename n_expert_ff to n_ff_exp
sszymczy May 27, 2024
5a3e6b6
llama : rename qk_rope_head_dim, qk_nope_head_dim variables to n_embd…
sszymczy May 27, 2024
20769c0
llama : remove trailing whitespaces
sszymczy May 27, 2024
fac1e80
llama : rename moe_intermediate_size variable to n_ff_exp
sszymczy May 27, 2024
56f7011
llama : rename n_leading_dense_layer to n_layer_dense_lead
sszymczy May 27, 2024
82cec8b
llama : use attn_factor in mscale calculation to match the rope_yarn(…
sszymczy May 27, 2024
5cc7ec1
llama : rename query_states, key_states, value_states to q_states, k_…
sszymczy May 27, 2024
d02130d
llama : print DeepSeek-V2-specific parameters in llm_load_print_meta()
sszymczy May 27, 2024
bde971a
convert-hf : fix flake8 Lint errors
sszymczy May 27, 2024
98ff6e1
Merge remote-tracking branch 'upstream/master' into deepseek-v2
sszymczy May 28, 2024
841cd47
llama : replace ggml_new_tensor_3d + ggml_set_inplace + ggml_set_inpl…
sszymczy May 28, 2024
3efb659
gguf-py, llama : whitespace formatting fixes
sszymczy May 28, 2024
Added five new DeepSeek-V2-specific parameters:
- leading_dense_block_count => hparams.n_leading_dense_layer,
- expert_feed_forward_length => hparams.n_expert_ff,
- expert_shared_count => hparams.n_expert_shared,
- attention.q_lora_rank => hparams.n_lora_q,
- attention.kv_lora_rank => hparams.n_lora_kv
sszymczy committed May 18, 2024
commit b50c07c247488736112240e0381e42a8333aaea8
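For orientation, the sketch below is not part of the diff: it illustrates, under stated assumptions, how the five config.json fields read by convert-hf-to-gguf.py end up as the new GGUF metadata keys defined in gguf-py/gguf/constants.py. The "deepseek2" architecture string is an assumption (the arch string itself does not appear in this commit), and the numeric values are examples taken from the constants previously hard-coded in llama.cpp.

```python
# Illustrative mapping only -- not code from this PR.
# Left: fields from the Hugging Face config.json; right: GGUF metadata keys
# added in this commit. The "deepseek2" arch string and the example values
# are assumptions made for the sake of the sketch.
hf_config = {
    "first_k_dense_replace": 1,
    "moe_intermediate_size": 1536,
    "n_shared_experts": 2,
    "q_lora_rank": 1536,
    "kv_lora_rank": 512,
}

arch = "deepseek2"  # assumed architecture string

gguf_metadata = {
    f"{arch}.leading_dense_block_count":  hf_config["first_k_dense_replace"],
    f"{arch}.expert_feed_forward_length": hf_config["moe_intermediate_size"],
    f"{arch}.expert_shared_count":        hf_config["n_shared_experts"],
    f"{arch}.attention.q_lora_rank":      hf_config["q_lora_rank"],
    f"{arch}.attention.kv_lora_rank":     hf_config["kv_lora_rank"],
}

for key, value in gguf_metadata.items():
    print(f"{key} = {value}")
```

On the llama.cpp side these keys are read back into hparams.n_leading_dense_layer, hparams.n_expert_ff, hparams.n_expert_shared, hparams.n_lora_q and hparams.n_lora_kv, as shown further down in llm_load_hparams().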
13 changes: 9 additions & 4 deletions convert-hf-to-gguf.py
@@ -2401,7 +2401,16 @@ def set_vocab(self):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams

self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])

if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
@@ -2410,10 +2419,6 @@ def set_gguf_parameters(self):
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])

self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])

_experts: list[dict[str, Tensor]] | None = None

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
5 changes: 5 additions & 0 deletions gguf-py/gguf/constants.py
@@ -37,11 +37,14 @@ class LLM:
CONTEXT_LENGTH = "{arch}.context_length"
EMBEDDING_LENGTH = "{arch}.embedding_length"
BLOCK_COUNT = "{arch}.block_count"
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
EXPERT_COUNT = "{arch}.expert_count"
EXPERT_USED_COUNT = "{arch}.expert_used_count"
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
POOLING_TYPE = "{arch}.pooling_type"
LOGIT_SCALE = "{arch}.logit_scale"

@@ -55,6 +58,8 @@ class Attention:
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
CAUSAL = "{arch}.attention.causal"
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"

class Rope:
DIMENSION_COUNT = "{arch}.rope.dimension_count"
15 changes: 15 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -376,9 +376,15 @@ def add_embedding_length(self, length: int) -> None:
def add_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length)

def add_leading_dense_block_count(self, length: int) -> None:
self.add_uint32(Keys.LLM.LEADING_DENSE_BLOCK_COUNT.format(arch=self.arch), length)

def add_feed_forward_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.FEED_FORWARD_LENGTH.format(arch=self.arch), length)

def add_expert_feed_forward_length(self, length: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_FEED_FORWARD_LENGTH.format(arch=self.arch), length)

def add_parallel_residual(self, use: bool) -> None:
self.add_bool(Keys.LLM.USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)

@@ -409,6 +415,9 @@ def add_expert_count(self, count: int) -> None:
def add_expert_used_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_USED_COUNT.format(arch=self.arch), count)

def add_expert_shared_count(self, count: int) -> None:
self.add_uint32(Keys.LLM.EXPERT_SHARED_COUNT.format(arch=self.arch), count)

def add_layer_norm_eps(self, value: float) -> None:
self.add_float32(Keys.Attention.LAYERNORM_EPS.format(arch=self.arch), value)

@@ -418,6 +427,12 @@ def add_layer_norm_rms_eps(self, value: float) -> None:
def add_causal_attention(self, value: bool) -> None:
self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)

def add_q_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.Q_LORA_RANK.format(arch=self.arch), length)

def add_kv_lora_rank(self, length: int) -> None:
self.add_uint32(Keys.Attention.KV_LORA_RANK.format(arch=self.arch), length)

def add_pooling_type(self, value: PoolingType) -> None:
self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)

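As a usage note, the new writer methods follow the same uint32 pattern as the existing add_* methods above. Below is a minimal, standalone sketch of calling them directly; it is not part of the PR, the GGUFWriter call sequence and the "deepseek2" arch string are assumptions, and the values are examples only.

```python
# Minimal sketch: write only the new DeepSeek-V2 metadata keys to a GGUF file.
# Assumptions: "deepseek2" arch string, example values, and the header/KV
# write sequence of gguf-py's GGUFWriter at the time of this PR.
import gguf

writer = gguf.GGUFWriter("deepseek-v2-kv-only.gguf", "deepseek2")

writer.add_leading_dense_block_count(1)      # first_k_dense_replace
writer.add_expert_feed_forward_length(1536)  # moe_intermediate_size
writer.add_expert_shared_count(2)            # n_shared_experts
writer.add_q_lora_rank(1536)
writer.add_kv_lora_rank(512)

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()
```

In the actual conversion flow these calls are made through self.gguf_writer in set_gguf_parameters() of convert-hf-to-gguf.py, as shown in the first file of this diff.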
53 changes: 37 additions & 16 deletions llama.cpp
@@ -288,11 +288,14 @@ enum llm_kv {
LLM_KV_CONTEXT_LENGTH,
LLM_KV_EMBEDDING_LENGTH,
LLM_KV_BLOCK_COUNT,
LLM_KV_LEADING_DENSE_BLOCK_COUNT,
LLM_KV_FEED_FORWARD_LENGTH,
LLM_KV_EXPERT_FEED_FORWARD_LENGTH,
LLM_KV_USE_PARALLEL_RESIDUAL,
LLM_KV_TENSOR_DATA_LAYOUT,
LLM_KV_EXPERT_COUNT,
LLM_KV_EXPERT_USED_COUNT,
LLM_KV_EXPERT_SHARED_COUNT,
LLM_KV_POOLING_TYPE,
LLM_KV_LOGIT_SCALE,

@@ -305,6 +308,8 @@ enum llm_kv {
LLM_KV_ATTENTION_LAYERNORM_EPS,
LLM_KV_ATTENTION_LAYERNORM_RMS_EPS,
LLM_KV_ATTENTION_CAUSAL,
LLM_KV_ATTENTION_Q_LORA_RANK,
LLM_KV_ATTENTION_KV_LORA_RANK,

LLM_KV_ROPE_DIMENSION_COUNT,
LLM_KV_ROPE_FREQ_BASE,
@@ -365,11 +370,14 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_CONTEXT_LENGTH, "%s.context_length" },
{ LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" },
{ LLM_KV_BLOCK_COUNT, "%s.block_count" },
{ LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" },
{ LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" },
{ LLM_KV_EXPERT_FEED_FORWARD_LENGTH, "%s.expert_feed_forward_length" },
{ LLM_KV_USE_PARALLEL_RESIDUAL, "%s.use_parallel_residual" },
{ LLM_KV_TENSOR_DATA_LAYOUT, "%s.tensor_data_layout" },
{ LLM_KV_EXPERT_COUNT, "%s.expert_count" },
{ LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
{ LLM_KV_EXPERT_SHARED_COUNT, "%s.expert_shared_count" },
{ LLM_KV_POOLING_TYPE , "%s.pooling_type" },
{ LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

@@ -382,6 +390,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
{ LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" },
{ LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" },
{ LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" },
{ LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" },
{ LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" },

{ LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
{ LLM_KV_ROPE_FREQ_BASE, "%s.rope.freq_base" },
@@ -1803,6 +1813,12 @@ struct llama_hparams {
uint32_t n_expert_used = 0;
uint32_t n_vocab_type = 0; // for BERT-style token types

uint32_t n_leading_dense_layer = 0;
uint32_t n_lora_q = 0;
uint32_t n_lora_kv = 0;
uint32_t n_expert_ff = 0;
uint32_t n_expert_shared = 0;

float f_norm_eps;
float f_norm_rms_eps;

Expand Down Expand Up @@ -1842,6 +1858,12 @@ struct llama_hparams {
if (this->n_expert != other.n_expert) return true;
if (this->n_expert_used != other.n_expert_used) return true;

if (this->n_leading_dense_layer != other.n_leading_dense_layer) return true;
if (this->n_lora_q != other.n_lora_q) return true;
if (this->n_lora_kv != other.n_lora_kv) return true;
if (this->n_expert_ff != other.n_expert_ff) return true;
if (this->n_expert_shared != other.n_expert_shared) return true;

if (this->rope_finetuned != other.rope_finetuned) return true;
if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true;

Expand Down Expand Up @@ -4306,6 +4328,12 @@ static void llm_load_hparams(
case LLM_ARCH_DEEPSEEK2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_leading_dense_layer);
ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_expert_ff);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);

model.type = e_model::MODEL_UNKNOWN;
} break;
default: (void)0;
@@ -6107,16 +6135,12 @@ static bool llm_load_tensors(
} break;
case LLM_ARCH_DEEPSEEK2:
{
// TODO maybe move some of these to hparams
const uint32_t n_shared_experts = 2;
const uint32_t moe_intermediate_size = 1536;
const uint32_t q_lora_rank = 1536;
const uint32_t kv_lora_rank = 512;
const uint32_t first_k_dense_replace = 1;

// kept original names of these parameters from HF transformers code for clarity
const uint32_t qk_rope_head_dim = hparams.n_rot;
const uint32_t qk_nope_head_dim = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t q_lora_rank = hparams.n_lora_q;
const uint32_t kv_lora_rank = hparams.n_lora_kv;
const uint32_t moe_intermediate_size = hparams.n_expert_ff;

model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

@@ -6144,7 +6168,7 @@ static bool llm_load_tensors(

layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

if ((uint32_t) i < first_k_dense_replace) {
if ((uint32_t) i < hparams.n_leading_dense_layer) {
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
@@ -6160,9 +6184,9 @@ static bool llm_load_tensors(
layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, moe_intermediate_size, n_expert});

// Shared expert branch
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, moe_intermediate_size * n_shared_experts});
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { moe_intermediate_size * n_shared_experts, n_embd});
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, moe_intermediate_size * n_shared_experts});
layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, moe_intermediate_size * hparams.n_expert_shared});
layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { moe_intermediate_size * hparams.n_expert_shared, n_embd});
layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, moe_intermediate_size * hparams.n_expert_shared});
}
}
} break;
@@ -10893,13 +10917,10 @@ struct llm_build_context {
// mutable variable, needed during the last layer of the computation to skip unused tokens
int32_t n_tokens = this->n_tokens;

// TODO maybe move some of these to hparams
const uint32_t first_k_dense_replace = 1;
const uint32_t kv_lora_rank = 512;

// kept original names of these parameters from HF transformers code for clarity
const uint32_t qk_rope_head_dim = hparams.n_rot;
const uint32_t qk_nope_head_dim = hparams.n_embd_head_k - hparams.n_rot;
const uint32_t kv_lora_rank = hparams.n_lora_kv;

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -11022,7 +11043,7 @@ struct llm_build_context {
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
cb(ffn_inp, "ffn_inp", il);

if ((uint32_t) il < first_k_dense_replace) {
if ((uint32_t) il < hparams.n_leading_dense_layer) {
cur = llm_build_norm(ctx0, ffn_inp, hparams,
model.layers[il].ffn_norm, NULL,
LLM_NORM_RMS, cb, il);