reusable buffers
mqy committed Jul 4, 2023
1 parent 8fa7e06 commit bf63002
Showing 8 changed files with 126 additions and 134 deletions.
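
Every file below makes the same change: instead of malloc()'ing and free()'ing `plan.work_data` around each `ggml_graph_compute()` call, the caller keeps a `std::vector<uint8_t>` alive across calls, resizes it to `plan.work_size`, and passes its `data()` pointer. A minimal sketch of the pattern, assuming a graph `gf` has already been built; the loop and `n_threads` are illustrative, only the plan/compute calls come from the diff:

    // Reusable work buffer: declared outside the loop so its storage survives
    // between iterations and the per-call malloc/free churn goes away.
    std::vector<uint8_t> compute_plan_buffer;

    for (int iter = 0; iter < n_iterations; ++iter) {
        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
        if (plan.work_size > 0) {
            // resize() reallocates only when the requested size exceeds the current capacity
            compute_plan_buffer.resize(plan.work_size);
            plan.work_data = compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, &gf);
        // no free(): the same buffer is handed to the next iteration
    }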
23 changes: 8 additions & 15 deletions examples/baby-llama/baby-llama.cpp
@@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1598,13 +1600,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
@@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
@@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) {
         {
             struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
29 changes: 11 additions & 18 deletions examples/benchmark/benchmark-matmult.cpp
@@ -164,16 +164,15 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
+        auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
 
     TENSOR_DUMP(gf.nodes[0]);
@@ -229,15 +228,12 @@ int main(int argc, char ** argv) {
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads);
            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf31);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         long long int stop = ggml_time_us();
@@ -272,15 +268,12 @@ int main(int argc, char ** argv) {
 
         // Running a different graph computation to make sure we override the CPU cache lines
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf32);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
     }
     printf("\n");
29 changes: 11 additions & 18 deletions examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) {
         }
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
@@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) {
         model.train_tokens += n_batch * n_tokens;
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
@@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) {
         ggml_build_forward_expand(&gf, logits);
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
+            auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
3 changes: 2 additions & 1 deletion ggml.c
@@ -16330,7 +16330,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     const struct ggml_cgraph * cgraph = state->shared->cgraph;
 
     const struct ggml_graph_compute_plan * plan = state->shared->plan;
-    const int *n_tasks_arr = plan->n_tasks;
+    const int * n_tasks_arr = plan->n_tasks;
 
     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16864,6 +16864,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap
     }
 }
 
+// TODO: avoid allocating memory frequently.
 static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) {
     struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads);
     if (plan.work_size > 0) {
2 changes: 1 addition & 1 deletion ggml.h
@@ -449,7 +449,7 @@ extern "C" {
         // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`.
         size_t work_size;
         // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`.
-        void * work_data;
+        uint8_t * work_data;
 
         int n_threads;
 
64 changes: 25 additions & 39 deletions llama.cpp
@@ -321,6 +321,10 @@ struct llama_context {
     // input embedding (1-dimensional array: [n_embd])
     std::vector<float> embedding;
 
+    // reusable buffer for `struct ggml_graph_compute_plan.work_data`
+    // std::vector guarantees the elements are stored contiguously.
+    std::vector<uint8_t> compute_plan_buffer;
+
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
     llama_ctx_buffer buf_compute;
@@ -1582,10 +1586,13 @@ static bool llama_eval_internal(
     // run the computation
     ggml_build_forward_expand(&gf, cur);
 
+    bool call_ggml_graph_compute = true;
+
 #ifdef GGML_USE_METAL
     if (lctx.ctx_metal && N == 1) {
         ggml_metal_graph_compute(lctx.ctx_metal, &gf);
         ggml_metal_get_tensor (lctx.ctx_metal, cur);
+        call_ggml_graph_compute = false;
     } else {
         // IMPORTANT:
         // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1602,32 +1609,17 @@ static bool llama_eval_internal(
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k);
             ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
         }
-
-        {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
-            if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
-            }
-            ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
-        }
     }
-#else
-    {
-        struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
+#endif
+
+    if (call_ggml_graph_compute) {
+        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
         if (plan.work_size > 0) {
-            plan.work_data = malloc(plan.work_size);
-            GGML_ASSERT(plan.work_data);
+            lctx.compute_plan_buffer.resize(plan.work_size);
+            plan.work_data = lctx.compute_plan_buffer.data();
         }
         ggml_graph_compute(&plan, &gf);
-        if (plan.work_data) {
-            free(plan.work_data);
-        }
     }
-#endif
 
     if (cgraph_fname) {
         ggml_graph_export(&gf, cgraph_fname);
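
Besides reusing the buffer, the hunk above restructures `llama_eval_internal()` so the CPU compute block is no longer duplicated between the `#ifdef GGML_USE_METAL` and `#else` branches: a `call_ggml_graph_compute` flag is cleared only when the Metal path has already evaluated the graph, and a single block after the `#endif` handles every other case. A condensed sketch of that control flow, with the Metal fallback's KV-cache sync elided:

    bool call_ggml_graph_compute = true;

    #ifdef GGML_USE_METAL
    if (lctx.ctx_metal && N == 1) {
        ggml_metal_graph_compute(lctx.ctx_metal, &gf);   // graph already evaluated on the GPU
        ggml_metal_get_tensor (lctx.ctx_metal, cur);
        call_ggml_graph_compute = false;                 // skip the CPU path below
    }
    // else branch (sync KV cache back from Metal) elided; it falls through to the CPU path
    #endif

    if (call_ggml_graph_compute) {
        auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads);
        if (plan.work_size > 0) {
            lctx.compute_plan_buffer.resize(plan.work_size);   // reusable buffer owned by llama_context
            plan.work_data = lctx.compute_plan_buffer.data();
        }
        ggml_graph_compute(&plan, &gf);
    }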
@@ -2815,6 +2807,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
     // read tensors and apply
     bool warned = false;
     int n_tensors = 0;
+
+    auto compute_plan_buffer = std::vector<uint8_t>();
+
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -2981,15 +2976,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
             struct ggml_cgraph gf = ggml_build_forward(r);
 
             {
-                struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
+                auto plan = ggml_graph_compute_make_plan(&gf, n_threads);
                 if (plan.work_size > 0) {
-                    plan.work_data = malloc(plan.work_size);
-                    GGML_ASSERT(plan.work_data);
+                    compute_plan_buffer.resize(plan.work_size);
+                    plan.work_data = compute_plan_buffer.data();
                 }
                 ggml_graph_compute(&plan, &gf);
-                if (plan.work_data) {
-                    free(plan.work_data);
-                }
             }
 
             // we won't need these tensors again, reset the context to save memory
@@ -3164,15 +3156,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);
@@ -3280,15 +3269,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
 
         {
-            struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
+            auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1);
             if (plan.work_size > 0) {
-                plan.work_data = malloc(plan.work_size);
-                GGML_ASSERT(plan.work_data);
+                ctx->compute_plan_buffer.resize(plan.work_size);
+                plan.work_data = ctx->compute_plan_buffer.data();
             }
             ggml_graph_compute(&plan, &gf);
-            if (plan.work_data) {
-                free(plan.work_data);
-            }
         }
 
         ggml_free(cpy_ctx);