ggml-alloc : v3 (#727)
* ggml-alloc v3

ggml-ci

* fix ci

ggml-ci

* whisper : check for backend buffer allocation failures

* whisper : avoid leaks when initialization fails

* cleanup

ggml-ci

* style fixes

ggml-ci
slaren committed Feb 11, 2024
1 parent ee07c17 commit 5070f07
Showing 19 changed files with 1,458 additions and 1,583 deletions.
5 changes: 3 additions & 2 deletions ci/run.sh
@@ -163,8 +163,9 @@ function gg_run_gpt_2 {
     model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
     prompts="../examples/prompts/gpt-2.txt"

-    (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
-    (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+    (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
+    (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+    (time ./bin/gpt-2-sched --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

     (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

4 changes: 2 additions & 2 deletions examples/gpt-2/CMakeLists.txt
@@ -13,8 +13,8 @@ set(TEST_TARGET gpt-2-backend)
 add_executable(${TEST_TARGET} main-backend.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

-set(TEST_TARGET gpt-2-backend2)
-add_executable(${TEST_TARGET} main.cpp)
+set(TEST_TARGET gpt-2-sched)
+add_executable(${TEST_TARGET} main-sched.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

 #
89 changes: 43 additions & 46 deletions examples/gpt-2/main-alloc.cpp
@@ -1,5 +1,6 @@
 #include "ggml/ggml.h"
 #include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"

 #include "common.h"
 #include "common-ggml.h"
@@ -69,7 +70,7 @@ struct gpt2_model {
     struct ggml_tensor * memory_v;

     //
-    struct ggml_context * ctx;
+    struct ggml_context * ctx_w;
     std::map<std::string, struct ggml_tensor *> tensors;
 };

@@ -153,7 +154,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         return false;
     }

-    auto & ctx = model.ctx;
+    auto & ctx = model.ctx_w;

     size_t ctx_size = 0;

@@ -207,8 +208,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         /*.no_alloc   =*/ false,
     };

-    model.ctx = ggml_init(params);
-    if (!model.ctx) {
+    model.ctx_w = ggml_init(params);
+    if (!model.ctx_w) {
         fprintf(stderr, "%s: ggml_init() failed\n", __func__);
         return false;
     }
@@ -385,10 +386,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 // build the computation graph
 struct ggml_cgraph * gpt2_graph(
         const gpt2_model & model,
-        struct ggml_allocr * allocr,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp) {
-    const int N = embd_inp.size();
+        const int n_tokens) {
+    const int N = n_tokens;

     const auto & hparams = model.hparams;

@@ -404,28 +404,24 @@ struct ggml_cgraph * gpt2_graph(
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_size,
         /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

     struct ggml_context * ctx0 = ggml_init(params);

     struct ggml_cgraph * gf = ggml_new_graph(ctx0);

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, embd);
-
-    // avoid writing to tensors if we are only measuring the memory usage
-    if (!ggml_allocr_is_measure(allocr)) {
-        memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-    }
+    // at this point, the tensor data is not allocated yet and cannot be set
+    // we will find the tensor after the graph is allocated by its name, and set the data then
+    ggml_set_name(embd, "embd");
+    // setting a tensor as an input will ensure that it is allocated at the beginning of the graph
+    // this is important to ensure that the input tensors are not overwritten before they are used
+    ggml_set_input(embd);

     struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_allocr_alloc(allocr, position);
-    if (!ggml_allocr_is_measure(allocr)) {
-        for (int i = 0; i < N; ++i) {
-            ((int32_t *) position->data)[i] = n_past + i;
-        }
-    }
+    ggml_set_name(position, "position");
+    ggml_set_input(position);

     // wte + wpe
     struct ggml_tensor * inpL =
@@ -655,6 +651,9 @@ struct ggml_cgraph * gpt2_graph(
     // [ 768, 50257] - model.lm_head
     // [ 768, N]     - inpL
     inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    ggml_set_name(inpL, "logits");
+    // setting a tensor as the output will ensure that it is not overwritten by subsequent operations
+    ggml_set_output(inpL);

     // logits -> probs
     //inpL = ggml_soft_max(ctx0, inpL);
@@ -669,15 +668,15 @@ struct ggml_cgraph * gpt2_graph(
 // evaluate the transformer
 //
 //   - model:     the model
-//   - allocr:    ggml_allocr to use to allocate the compute buffer
+//   - allocr:    ggml_gallocr to use to allocate the compute buffer
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
 //   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
         const gpt2_model & model,
-        struct ggml_allocr * allocr,
+        ggml_gallocr_t allocr,
         const int n_threads,
         const int n_past,
         const std::vector<gpt_vocab::id> & embd_inp,
@@ -688,13 +687,19 @@

     const int n_vocab = hparams.n_vocab;

-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
+    struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp.size());

-    struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
+    // allocate the graph tensors
+    ggml_gallocr_alloc_graph(allocr, gf);

-    // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
+    // set the graph inputs
+    struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+    struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+    for (int i = 0; i < N; ++i) {
+        ((int32_t *) position->data)[i] = n_past + i;
+    }

     // run the computation
     struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
@@ -708,15 +713,15 @@
     //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
     //}

-    // in this case, the output tensor is the last one in the graph
-    struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
+    // get the graph outputs
+    struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");

     //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(logits), sizeof(float)*n_vocab*N);

     // return result just for the last token
     embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+    memcpy(embd_w.data(), (float *) ggml_get_data(logits) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

     return true;
 }
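
The hunks above capture the core of the v3 API: graphs are built in a no_alloc context, tensors are placed by ggml_gallocr_alloc_graph(), and inputs/outputs are addressed by name instead of by position. A minimal, self-contained sketch of that flow — an illustrative reconstruction from the calls visible in this diff, not code from the commit itself:

#include "ggml/ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#include <cstring>
#include <cstdio>
#include <vector>

int main() {
    // graph tensors are placed by the graph allocator later, so the
    // context only needs room for tensor/graph metadata
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // data is allocated later by ggml_gallocr_alloc_graph()
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_name(a, "a");
    ggml_set_input(a); // allocated at the start of the graph, not overwritten before use

    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    ggml_set_name(b, "b");
    ggml_set_input(b);

    struct ggml_tensor * out = ggml_mul_mat(ctx, a, b);
    ggml_set_name(out, "out");
    ggml_set_output(out); // not overwritten by subsequent operations

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    // one allocator call replaces the old measure/alloc two-step
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
    ggml_gallocr_alloc_graph(allocr, gf);

    // only now do the input tensors have data that can be written
    std::vector<float> ones(16, 1.0f);
    memcpy(ggml_graph_get_tensor(gf, "a")->data, ones.data(), 16*sizeof(float));
    memcpy(ggml_graph_get_tensor(gf, "b")->data, ones.data(), 16*sizeof(float));

    struct ggml_cplan plan = ggml_graph_plan(gf, /*n_threads=*/1);
    std::vector<uint8_t> work(plan.work_size);
    plan.work_data = work.data();
    ggml_graph_compute(gf, &plan);

    // outputs are retrieved by name, not by position in the graph
    float v = *(float *) ggml_graph_get_tensor(gf, "out")->data;
    printf("out[0] = %f\n", v); // 4.0 for a 4x4 all-ones matmul

    ggml_gallocr_free(allocr);
    ggml_free(ctx);
    return 0;
}
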
@@ -763,27 +768,19 @@ int main(int argc, char ** argv) {
         test_gpt_tokenizer(vocab, params.token_test);
     }

-    // keep this buffer alive while evaluating the model
-    std::vector<uint8_t> compute_buffer;
-
-    struct ggml_allocr * allocr = NULL;
+    ggml_gallocr_t allocr = NULL;
     // allocate the compute buffer
     {
-        allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
+        allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

         // create the worst case graph for memory usage estimation
         int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
         int n_past = model.hparams.n_ctx - n_tokens;
-        struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
-        // compute the required memory
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
-
-        // recreate the allocator with the required memory
-        ggml_allocr_free(allocr);
-        compute_buffer.resize(mem_size);
-        allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
+        struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens);
+
+        // pre-allocate the compute buffer for the worst case (optional)
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);

         fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
     }

@@ -880,7 +877,7 @@ int main(int argc, char ** argv) {
         printf("%s:     total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
     }

-    ggml_free(model.ctx);
+    ggml_free(model.ctx_w);

     return 0;
 }
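
The "(optional)" comment in the main() hunk above marks the other half of the design: ggml_gallocr_alloc_graph() can size the compute buffer by itself on first use, while ggml_gallocr_reserve() pre-sizes it from a worst-case graph. A sketch of that setup, reusing gpt2_graph() and the names from main() above; the assumption, suggested by the comment, is that graphs no larger than the reserved one then fit without reallocation:

// sketch only: model, params, and gpt2_graph() come from the example above
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

// build the largest graph the program will ever evaluate
const int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
const int n_past   = model.hparams.n_ctx - n_tokens;
struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens);

// reserve once; subsequent ggml_gallocr_alloc_graph() calls for graphs of
// this size or smaller reuse the same buffer
ggml_gallocr_reserve(allocr, gf);

fprintf(stderr, "compute buffer size: %.2f MB\n",
        ggml_gallocr_get_buffer_size(allocr, 0)/1024.0/1024.0);
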