examples : add quantize version to MPT and Replit examples (ref #168)
ggerganov committed May 20, 2023
1 parent 3fe2082 commit c2fab8a
Showing 4 changed files with 111 additions and 89 deletions.
35 changes: 20 additions & 15 deletions examples/mpt/main.cpp
@@ -22,14 +22,14 @@ int n_ctx = 4096;
 
 // no defaults for now
 struct mpt_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
+    int32_t d_model      = 0;
+    int32_t max_seq_len  = 0;
+    int32_t n_heads      = 0;
+    int32_t n_layers     = 0;
+    int32_t n_vocab      = 0;
     float alibi_bias_max = 0;
-    float clip_qkv = 0;
-    int32_t ftype = 0;
+    float clip_qkv       = 0;
+    int32_t ftype        = 0;
 };
 
 struct mpt_layer {
@@ -88,14 +88,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     {
         auto & hparams = model.hparams;
 
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fin.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
+        fin.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+        fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
 
         printf("%s: d_model        = %d\n", __func__, hparams.d_model);
         printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
@@ -105,6 +105,9 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
         printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
         printf("%s: ftype          = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr          = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
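The loader change above splits the single on-disk ftype field into a quantization version (qntvr) and the underlying tensor format: the version lives in the multiples of GGML_QNT_VERSION_FACTOR and the format in the remainder. A minimal standalone sketch of the decode arithmetic; the macro values are assumptions mirroring ggml.h, and the packed input value is hypothetical:

    #include <cstdint>
    #include <cstdio>

    // Assumed to match ggml.h at the time of this commit; the real examples
    // include ggml.h and use its macros directly.
    #define GGML_QNT_VERSION        1
    #define GGML_QNT_VERSION_FACTOR 1000

    int main() {
        // Hypothetical packed header value: quantization version 1 in the
        // thousands, tensor format 2 in the remainder.
        int32_t ftype = 1002;

        const int32_t qntvr = ftype / GGML_QNT_VERSION_FACTOR; // -> 1
        ftype %= GGML_QNT_VERSION_FACTOR;                      // -> 2

        printf("qntvr = %d, ftype = %d\n", qntvr, ftype);
        return 0;
    }

Files written before this scheme store a raw ftype smaller than the factor, so they decode to qntvr = 0 with an unchanged ftype, which keeps older models loadable.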
70 changes: 38 additions & 32 deletions examples/mpt/quantize.cpp
@@ -14,14 +14,14 @@
 #include <vector>
 
 struct mpt_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
+    int32_t d_model      = 0;
+    int32_t max_seq_len  = 0;
+    int32_t n_heads      = 0;
+    int32_t n_layers     = 0;
+    int32_t n_vocab      = 0;
     float alibi_bias_max = 0;
-    float clip_qkv = 0;
-    int32_t ftype = 0;
+    float clip_qkv       = 0;
+    int32_t ftype        = 0;
 };
 
 // quantize a model
@@ -61,32 +61,38 @@ bool mpt_model_quantize(const std::string & fname_inp,
 
     // load hparams
     {
-        finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        finp.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        finp.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+        finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+        finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
+        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
         printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
-        fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fout.write((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        fout.write((char *)&ftype, sizeof(hparams.ftype));
+        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
+        printf("%s: ftype (src)    = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src)    = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst)    = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst)    = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
+        fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        fout.write((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
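On the write side, the quantizer packs the current quantization version together with the requested destination format into the single int32 that is written back to the header (the ftype_dst expression above). A sketch of the round trip under the same assumed macro values:

    #include <cstdint>
    #include <cstdio>

    #define GGML_QNT_VERSION        1    // assumed, see ggml.h
    #define GGML_QNT_VERSION_FACTOR 1000

    // Pack version and tensor format into one header field, mirroring the
    // ftype_dst expression in the diff above.
    static int32_t pack_ftype(int32_t ftype) {
        return GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
    }

    int main() {
        const int32_t ftype     = 2; // example destination format
        const int32_t ftype_dst = pack_ftype(ftype);

        printf("ftype_dst = %d\n", ftype_dst); // 1002

        // A loader recovers both halves with / and %, as in main.cpp above.
        printf("qntvr = %d, ftype = %d\n",
               ftype_dst / GGML_QNT_VERSION_FACTOR,
               ftype_dst % GGML_QNT_VERSION_FACTOR);
        return 0;
    }

Note that the final write now takes sizeof(ftype_dst) rather than sizeof(hparams.ftype); both are int32_t, so the header layout is unchanged.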
41 changes: 23 additions & 18 deletions examples/replit/main.cpp
@@ -129,12 +129,12 @@ std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std:
 
 // no defaults for now
 struct mpt_hparams {
-    int32_t d_model = 0;
+    int32_t d_model     = 0;
     int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
+    int32_t n_heads     = 0;
+    int32_t n_layers    = 0;
+    int32_t n_vocab     = 0;
+    int32_t ftype       = 0;
 };
 
 struct replit_layer {
@@ -195,19 +195,24 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
     {
         auto & hparams = model.hparams;
 
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
+        fin.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+        fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
+        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: ftype       = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr       = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
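The Replit loader decodes qntvr the same way and, like the MPT one, only prints it. A loader that wanted to reject models written under a different quantization version could add a guard along these lines; this is a hypothetical extension, not part of this commit:

    #include <cstdint>
    #include <cstdio>

    #define GGML_QNT_VERSION        1    // assumed, see ggml.h
    #define GGML_QNT_VERSION_FACTOR 1000

    // Hypothetical check on the raw ftype field read from a model header.
    static bool qnt_version_ok(int32_t raw_ftype) {
        const int32_t qntvr = raw_ftype / GGML_QNT_VERSION_FACTOR;
        if (qntvr != GGML_QNT_VERSION) {
            fprintf(stderr, "bad quantization version %d (expected %d)\n",
                    qntvr, GGML_QNT_VERSION);
            return false;
        }
        return true;
    }

    int main() {
        printf("%d\n", qnt_version_ok(1002)); // 1: version matches
        printf("%d\n", qnt_version_ok(2));    // 0: pre-versioning file (qntvr 0)
        return 0;
    }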
54 changes: 30 additions & 24 deletions examples/replit/quantize.cpp
@@ -14,12 +14,12 @@
 #include <vector>
 
 struct mpt_hparams {
-    int32_t d_model = 0;
+    int32_t d_model     = 0;
     int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
+    int32_t n_heads     = 0;
+    int32_t n_layers    = 0;
+    int32_t n_vocab     = 0;
+    int32_t ftype       = 0;
 };
 
 // quantize a model
@@ -59,26 +59,32 @@ bool mpt_model_quantize(const std::string & fname_inp,
 
     // load hparams
    {
-        finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
+        finp.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+        finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
         printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
-        fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *)&ftype, sizeof(hparams.ftype));
+        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
+        fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+        fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
    }
 
     // load vocab
