From fcf734c58122237de6fb39ad12820a6c22bf6edd Mon Sep 17 00:00:00 2001
From: Ravindra Marella
Date: Thu, 18 May 2023 18:48:26 +0530
Subject: [PATCH 1/3] mpt : move global variable `n_ctx` to `mpt_hparams`

---
 examples/mpt/main.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp
index 5a60367a3..25c2288aa 100644
--- a/examples/mpt/main.cpp
+++ b/examples/mpt/main.cpp
@@ -18,10 +18,9 @@
 #include <utility>
 #include <vector>
 
-int n_ctx = 4096;
-
 // no defaults for now
 struct mpt_hparams {
+    int32_t n_ctx       = 4096;
     int32_t d_model     = 0;
     int32_t max_seq_len = 0;
     int32_t n_heads     = 0;
@@ -141,6 +140,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     {
         const auto & hparams = model.hparams;
 
+        const int32_t n_ctx = hparams.n_ctx;
         const size_t n_embd  = hparams.d_model;
         const size_t n_layer = hparams.n_layers;
         const size_t n_vocab = hparams.n_vocab;
@@ -220,6 +220,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     {
         const auto & hparams = model.hparams;
 
+        const int32_t n_ctx = hparams.n_ctx;
         const size_t n_embd  = hparams.d_model;
         const size_t n_layer = hparams.n_layers;
 
@@ -231,7 +232,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %lld\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
+        printf("%s: memory_size = %8.2f MB, n_mem = %ld\n", __func__, memory_size / 1024.0 / 1024.0, n_mem);
     }
 
     // load weights
@@ -332,6 +333,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
 
     const auto & hparams = model.hparams;
 
+    const int32_t n_ctx = hparams.n_ctx;
     const int n_embd  = hparams.d_model;
     const int n_layer = hparams.n_layers;
     const int n_head  = hparams.n_heads;
@@ -593,6 +595,7 @@ int main(int argc, char ** argv) {
     }
     printf("\n");
 
+    const int32_t n_ctx = model.hparams.n_ctx;
     params.n_predict = std::min(params.n_predict, n_ctx - (int)embd_inp.size());
 
     std::vector<gpt_vocab::id> embd;

From eec32d1b71e691b57eb3253cea0a581e11bedd5f Mon Sep 17 00:00:00 2001
From: Ravindra Marella
Date: Sun, 21 May 2023 02:39:31 +0530
Subject: [PATCH 2/3] mpt : fix warnings

---
 examples/mpt/main.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp
index e4caf7edd..095654209 100644
--- a/examples/mpt/main.cpp
+++ b/examples/mpt/main.cpp
@@ -203,7 +203,7 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     model.tensors["transformer.wte.weight"]    = model.wte_weight;
     model.tensors["transformer.norm_f.weight"] = model.norm_f_weight;
 
-    for (int i = 0; i < n_layer; ++i) {
+    for (int i = 0; i < (int)n_layer; ++i) {
         auto & layer = model.layers[i];
 
         layer.norm_1_weight = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
@@ -379,7 +379,8 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     };
 
     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph gf = {.n_threads = n_threads};
+    struct ggml_cgraph gf = {};
+    gf.n_threads = n_threads;
 
     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
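Note on the warning fixed in PATCH 2/3: `struct ggml_cgraph gf = {.n_threads = n_threads};` uses a C99-style designated initializer, which only became standard C++ in C++20, so compilers warn about it (or reject it under -pedantic) when the example is built as pre-C++20 C++. The patch value-initializes the struct and then assigns the one field it needs. The sketch below illustrates the pattern; `my_cgraph` and `build_graph` are reduced stand-ins invented for this note, not ggml's real `ggml_cgraph` type:

// Reduced stand-in for ggml's graph struct; the real ggml_cgraph has
// many more fields, which is why zeroing all of them in one step matters.
struct my_cgraph {
    int n_nodes;
    int n_leafs;
    int n_threads;
};

int build_graph(int n_threads) {
    // struct my_cgraph gf = {.n_threads = n_threads}; // C99 designated
    // initializer: warns or errors in pre-C++20 dialects.

    my_cgraph gf = {};        // empty braces zero-initialize every field
    gf.n_threads = n_threads; // then set the one field we care about

    return gf.n_threads;
}

The same patch silences a signed/unsigned comparison warning by casting the `size_t` loop bound: `i < (int)n_layer`.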
From b717faeaad89744ca281c099b84584d063723ae4 Mon Sep 17 00:00:00 2001
From: Ravindra Marella
Date: Mon, 22 May 2023 20:56:03 +0530
Subject: [PATCH 3/3] mpt : fix `n_ctx`

---
 examples/mpt/main.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp
index 095654209..c1e32ee1d 100644
--- a/examples/mpt/main.cpp
+++ b/examples/mpt/main.cpp
@@ -3,6 +3,7 @@
 #include "common-ggml.h"
 #include "common.h"
 
+#include <algorithm>
 #include <cmath>
 #include <cstddef>
 #include <cstdio>
@@ -19,10 +20,11 @@
 #include <vector>
 
 // no defaults for now
+// Here `n_ctx` is the max limit for context length.
+// See https://github.com/ggerganov/ggml/pull/165#issuecomment-1556233670
 struct mpt_hparams {
-    int32_t n_ctx       = 4096;
     int32_t d_model     = 0;
-    int32_t max_seq_len = 0;
+    int32_t n_ctx       = 4096;
     int32_t n_heads     = 0;
     int32_t n_layers    = 0;
     int32_t n_vocab     = 0;
@@ -86,9 +88,10 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     // load hparams
     {
         auto & hparams = model.hparams;
+        int32_t n_ctx;
 
         fin.read((char *) &hparams.d_model,     sizeof(hparams.d_model));
-        fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fin.read((char *) &n_ctx,               sizeof(hparams.n_ctx));
         fin.read((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
         fin.read((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
         fin.read((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
@@ -96,10 +99,11 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
         fin.read((char *) &hparams.ftype,    sizeof(hparams.ftype));
 
+        hparams.n_ctx = std::min(n_ctx, hparams.n_ctx);
         const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
 
         printf("%s: d_model     = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
         printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
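Note on the clamp in PATCH 3/3: after this change, `n_ctx` is the minimum of the compile-time limit (4096) and the `max_seq_len` value stored in the model file, so the KV cache is never sized beyond what either side supports. A minimal standalone sketch of that behaviour follows; `hparams_sketch` is the struct cut down to one field, and `n_ctx_from_file` stands in for the value `fin.read` pulls from the model header (both names are illustrative, not from the patched file):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Reduced stand-in for mpt_hparams: only the field the sketch needs.
struct hparams_sketch {
    int32_t n_ctx = 4096; // compile-time upper limit for the context length
};

int main() {
    hparams_sketch hparams;

    // Stand-in for the max_seq_len value read from the model file.
    int32_t n_ctx_from_file = 2048;

    // Keep the smaller value, as the loader does: a model trained with
    // max_seq_len = 2048 gets a 2048-token context, while a model
    // reporting more than 4096 is still capped at the 4096 default.
    hparams.n_ctx = std::min(n_ctx_from_file, hparams.n_ctx);

    printf("n_ctx = %d\n", (int) hparams.n_ctx); // prints: n_ctx = 2048
    return 0;
}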