-
Notifications
You must be signed in to change notification settings - Fork 24
/
starcoder.cpp
115 lines (93 loc) · 5.19 KB
/
starcoder.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
namespace v2
{
struct Config : public BaseConfig
{
    // Number of key/value heads for grouped-query attention
    // (passed to the model alongside num_attention_heads).
    int num_key_value_heads;
    // Attention sliding-window length from the checkpoint; must equal
    // SLIDING_WINDOW_LEN (checked in ConditionalGeneration's constructor).
    int sliding_window;
    // RoPE base frequency; copied into each layer's attention as freq_base.
    float rope_theta;
};
const int SLIDING_WINDOW_LEN = 4096;
class Tokenizer : public BaseTokenizer
{
public:
    /// Construct the StarCoder2 tokenizer with an optional chat-history
    /// encoder. The system prompt is empty for this model.
    /// Replaces the former pair of delegating constructors with a single
    /// default-argument constructor; both call sites remain valid.
    Tokenizer(const Config &config, BaseHistoryEncoder *encoder = nullptr)
        : BaseTokenizer::BaseTokenizer(config, encoder)
    {
        sys_prompt = "";
    }

    /// Load the BPE vocabulary (up to n_vocab entries) from `buffer`;
    /// returns the size reported by the underlying BPE processor.
    size_t load(tokenizer::DataReader *buffer, int n_vocab) override;
};
class ConditionalGeneration : public BaseModelForConditionalGeneration
{
public:
    // Concrete transformer type: embedding + LayerNorm final norm, with
    // StarCoder2 blocks specialized on the fixed sliding-window length.
    typedef Model<Config, Embedding, LayerNorm, StarCoder2Block<SLIDING_WINDOW_LEN>, int, int, int, int, int> ModelClass;
public:
    ConditionalGeneration(const Config &config);
    // Read all weight tensors from `loader` into the transformer (see .cpp body).
    void load(ModelLoader &loader) override;
public:
    // Fixed buffer budgets for compute and scratch memory.
    static constexpr size_t MEM_SIZE = 1812ull * 1024 * 1024;
    static constexpr size_t SCRATCH_SIZE = 444ull * 1024 * 1024;
    Config config;
private:
    // hold ggml_context & kv_cache
    InitContext w_ctx_; // weight context
};
size_t Tokenizer::load(tokenizer::DataReader *buffer, int n_vocab)
{
    // Create the BPE text processor, hand it the reader to pull the
    // vocabulary from, and propagate the size it reports.
    tp = new tokenizer::BPEProcessor2();
    return tp->Load(buffer, n_vocab);
}
ConditionalGeneration::ConditionalGeneration(const Config &config)
    : BaseModelForConditionalGeneration(MODEL_TYPE_STARCODER2, config, MEM_SIZE, SCRATCH_SIZE), config(config)
{
    // The weight context is metadata-only (no_alloc = true below), so its
    // size is just per-tensor bookkeeping overhead times the tensor count.
    constexpr size_t tensor_ovhd = GGML_TENSOR_SIZE + GGML_OBJECT_SIZE;
    // NOTE(review): 20 slots per layer exceeds the 16 weight tensors read in
    // load(); presumably the extra slots cover tensors the block creates
    // internally — confirm against StarCoder2Block before tightening.
    const size_t num_tensors = 3 + config.num_hidden_layers * 20;
    const size_t ctx_size = num_tensors * tensor_ovhd;
    w_ctx_.gctx = GGMLContext({.mem_size = ctx_size, .mem_buffer = nullptr, .no_alloc = true});
    w_ctx_.dtype = config.dtype;
    // The window length is a compile-time template argument of ModelClass,
    // so a checkpoint with a different value cannot be loaded — fail fast.
    CHATLLM_CHECK(config.sliding_window == SLIDING_WINDOW_LEN)
        << "sliding_window (" << config.sliding_window << ") must be " << SLIDING_WINDOW_LEN;
    transformer = new ModelClass(&w_ctx_, config, nullptr,
                                 config.hidden_size, config.num_attention_heads,
                                 config.intermediate_size, config.num_key_value_heads, config.max_length);
    // Propagate the checkpoint's RoPE base frequency into every layer.
    for (int i = 0; i < config.num_hidden_layers; i++)
    {
        auto &layer = get_typed_transformer<ModelClass>()->layers[i];
        layer.attention.freq_base = config.rope_theta;
    }
    batch_input = false;
}
void ConditionalGeneration::load(ModelLoader &loader)
{
    // Map checkpoint tensor names (HuggingFace "model.*" layout) onto the
    // in-memory transformer, one read per weight tensor.
    auto transformer = get_typed_transformer<ModelClass>();
    loader.read_tensor("model.embed_tokens.weight", transformer->word_embeddings.weight);
    for (int i = 0; i < config.num_hidden_layers; i++)
    {
        // NOTE(review): layer_ids presumably remaps local layer index i to the
        // checkpoint's layer number (e.g. for layer subsets) — defined in a
        // base class outside this view; confirm.
        std::string layer_prefix = "model.layers." + std::to_string(layer_ids[i]) + '.';
        // Both layernorms carry weight and bias.
        loader.read_tensor(layer_prefix + "input_layernorm.weight", transformer->layers[i].input_layernorm.weight);
        loader.read_tensor(layer_prefix + "input_layernorm.bias", transformer->layers[i].input_layernorm.bias);
        // MLP: checkpoint's c_fc/c_proj map to fc0/fc1.
        loader.read_tensor(layer_prefix + "mlp.c_fc.weight", transformer->layers[i].mlp.fc0.weight);
        loader.read_tensor(layer_prefix + "mlp.c_fc.bias", transformer->layers[i].mlp.fc0.bias);
        loader.read_tensor(layer_prefix + "mlp.c_proj.weight", transformer->layers[i].mlp.fc1.weight);
        loader.read_tensor(layer_prefix + "mlp.c_proj.bias", transformer->layers[i].mlp.fc1.bias);
        loader.read_tensor(layer_prefix + "post_attention_layernorm.weight", transformer->layers[i].post_attention_layernorm.weight);
        loader.read_tensor(layer_prefix + "post_attention_layernorm.bias", transformer->layers[i].post_attention_layernorm.bias);
        // Attention projections (k/o/q/v), each with weight and bias.
        loader.read_tensor(layer_prefix + "self_attn.k_proj.weight", transformer->layers[i].attention.k_proj.weight);
        loader.read_tensor(layer_prefix + "self_attn.k_proj.bias", transformer->layers[i].attention.k_proj.bias);
        loader.read_tensor(layer_prefix + "self_attn.o_proj.weight", transformer->layers[i].attention.o_proj.weight);
        loader.read_tensor(layer_prefix + "self_attn.o_proj.bias", transformer->layers[i].attention.o_proj.bias);
        loader.read_tensor(layer_prefix + "self_attn.q_proj.weight", transformer->layers[i].attention.q_proj.weight);
        loader.read_tensor(layer_prefix + "self_attn.q_proj.bias", transformer->layers[i].attention.q_proj.bias);
        loader.read_tensor(layer_prefix + "self_attn.v_proj.weight", transformer->layers[i].attention.v_proj.weight);
        loader.read_tensor(layer_prefix + "self_attn.v_proj.bias", transformer->layers[i].attention.v_proj.bias);
    }
    loader.read_tensor("model.norm.weight", transformer->final_layernorm.weight);
    loader.read_tensor("model.norm.bias", transformer->final_layernorm.bias);
    // Sanity check: the weight context must be fully consumed; a mismatch
    // means the tensor count budgeted in the constructor is wrong.
    CHATLLM_CHECK(ggml_used_mem(w_ctx_.gctx.get()) == ggml_get_mem_size(w_ctx_.gctx.get()))
        << "corrupted model weights";
}
}