From a1e7c6922898c013ac67e0fa517531d8367da841 Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 27 Jun 2023 05:47:08 +0800 Subject: [PATCH 01/20] ggml_graph_compute: deprecate using ggml_context, try resolve issue #287 --- .../train-text-from-scratch.cpp | 2 - ggml.c | 89 +++++++++++++------ ggml.h | 26 +++++- 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c50eeb343bcef..7f7bf3b6fed53 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->n_nodes = 0; gf->n_leafs = 0; - gf->work_size = 0; gf->perf_runs = 0; gf->perf_cycles = 0; gf->perf_time_us = 0; - gf->work = NULL; const auto & hparams = model->hparams; //const int n_ctx = hparams.n_ctx; diff --git a/ggml.c b/ggml.c index d257c3d657b34..0035066000af0 100644 --- a/ggml.c +++ b/ggml.c @@ -15773,8 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15946,6 +15944,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; + struct ggml_cgraph_context * cgraph_ctx; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15975,6 +15974,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; + struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (node_n != -1) { @@ -16057,8 +16057,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (state->ith < node->n_tasks) { @@ -16069,23 +16069,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +// Prepare for graph computing. +// Will set: node->n_tasks, ctx->{work_size, planned} +void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + GGML_ASSERT(ctx); + // This function is actually reentrant, but duplicate calls is unnecessary. 
+ GGML_ASSERT(ctx->work_size == 0); + GGML_ASSERT(ctx->work_data == NULL); + GGML_ASSERT(!ctx->planned); - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + int n_threads = cgraph->n_threads; + size_t work_size = 0; // initialize tasks + work buffer { - size_t work_size = 0; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -16399,19 +16396,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; } } + } - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + ctx->work_size = work_size; + ctx->work_data = NULL; + ctx->planned = true; +} - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); +void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + if (ctx == NULL) { + ctx = alloca(sizeof(struct ggml_cgraph_context)); + GGML_ASSERT(ctx); + ctx->work_size = 0; + ctx->work_data = NULL; + ctx->planned = false; + } else { + // The work_size and work_data MAY have default values even if has been planned. + if (ctx->work_size > 0) { + GGML_ASSERT(ctx->work_data); + } + } - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + if (!ctx->planned) { + ggml_graph_compute_plan(ctx, cgraph); + if (ctx->work_size > 0) { + ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8)); + GGML_ASSERT(ctx->work_data); + GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size); } } + const int n_threads = cgraph->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_ctx =*/ ctx, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16463,6 +16494,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } +// Deprecated, keep it only for backward compatibility. +void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { + UNUSED(ctx); + ggml_graph_compute_v2(NULL, cgraph); +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; diff --git a/ggml.h b/ggml.h index 24ca8ae221c75..f949fe35f6877 100644 --- a/ggml.h +++ b/ggml.h @@ -437,15 +437,23 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // graph compute context + struct ggml_cgraph_context { + // After call to `ggml_graph_compute_plan()`, `planned` is set as true, + // `work_size` will be updated as non-zero when buffer is required. When + // need buffer, caller MUST allocate memory for `work_data`. 
+ // See https://github.com/ggerganov/ggml/issues/287 + size_t work_size; + void * work_data; + bool planned; // true means ready to compute graph nodes. + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; int n_threads; - size_t work_size; - struct ggml_tensor * work; - struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; struct ggml_tensor * leafs[GGML_MAX_NODES]; @@ -1297,6 +1305,18 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + // Since https://github.com/ggerganov/ggml/issues/287 + GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Since https://github.com/ggerganov/ggml/issues/287 + // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`. + // Another use case: allocate buffer explicitly: + // - call `ggml_graph_compute_plan()`; + // - allocate memory for `ctx->work_data`; + // - finally call `ggml_graph_compute_v2()`. + // NOTE: don't manually set `ctx->planned`. + GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead. + // See https://github.com/ggerganov/ggml/issues/287 GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); From b11ac01f6b3985a8f41d9a99db076982a61bfec0 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 16:00:47 +0800 Subject: [PATCH 02/20] rewrite: no longer consider backward compitability; plan and make_plan --- examples/baby-llama/baby-llama.cpp | 41 +++- examples/benchmark/benchmark-matmult.cpp | 46 +++- .../train-text-from-scratch.cpp | 41 +++- ggml.c | 229 ++++++++++-------- ggml.h | 52 ++-- llama.cpp | 68 +++++- tests/test-grad0.c | 66 ++++- tests/test-opt.c | 28 ++- 8 files changed, 405 insertions(+), 166 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 212f54d32cbad..f147c23a205b5 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1586,7 +1586,6 @@ int main(int argc, char ** argv) { int n_past = 0; ggml_cgraph gf = {}; - gf.n_threads = 1; get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); @@ -1595,7 +1594,18 @@ int main(int argc, char ** argv) { struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1611,7 +1621,18 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_after_opt = ggml_get_f32_1d(e, 
0); @@ -1659,13 +1680,23 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph gf = {}; - gf.n_threads = 1; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 39d15caeb7779..e4f361e13fdec 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -159,13 +159,22 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); TENSOR_DUMP(m11); TENSOR_DUMP(m2); - ggml_graph_compute(ctx, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } TENSOR_DUMP(gf.nodes[0]); @@ -187,7 +196,6 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); - gf31.n_threads=benchmark_params.n_threads; // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -199,8 +207,7 @@ int main(int argc, char ** argv) { //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); - gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; const int dimy = sizey; @@ -221,14 +228,25 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute(ctx, &gf31); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf31); + if (plan.work_data) { + free(plan.work_data); + } + } + long long int stop = ggml_time_us(); long long int usec = stop-start; double gflops = (double)(flops_per_matrix)/usec/1000.0; gflops_sum += gflops; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", i, - gf31.n_threads, + benchmark_params.n_threads, sizex, sizey, sizez, flops_per_matrix, usec,gflops); @@ -253,7 +271,17 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute(ctx, &gf32); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = 
malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf32); + if (plan.work_data) { + free(plan.work_data); + } + } } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7f7bf3b6fed53..83da31531da57 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3215,9 +3215,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - // ggml_cgraph gf = {}; - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -3246,7 +3243,17 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - ggml_graph_compute(ctx0, gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gf); + if (plan.work_data) { + free(plan.work_data); + } + } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3270,7 +3277,17 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute(ctx0, gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3352,13 +3369,23 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(cparams); ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.c b/ggml.c index 0035066000af0..f019774e39116 100644 --- a/ggml.c +++ b/ggml.c @@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -15772,7 +15771,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15944,7 +15942,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; - struct ggml_cgraph_context * cgraph_ctx; + struct ggml_graph_compute_plan * cgraph_ctx; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15974,7 +15972,9 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; - struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx; + + struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx; + const int *n_tasks_arr = ctx->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15997,7 +15997,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16008,11 +16008,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16020,7 +16021,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; @@ -16052,16 +16053,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, + /*.nth =*/ n_tasks, /*.wsize =*/ ctx->work_size, /*.wdata =*/ ctx->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16070,15 +16072,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } // Prepare for graph computing. -// Will set: node->n_tasks, ctx->{work_size, planned} -void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { - GGML_ASSERT(ctx); - // This function is actually reentrant, but duplicate calls is unnecessary. 
- GGML_ASSERT(ctx->work_size == 0); - GGML_ASSERT(ctx->work_data == NULL); - GGML_ASSERT(!ctx->planned); - - int n_threads = cgraph->n_threads; +struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + struct ggml_graph_compute_plan ctx; + memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan)); + int * n_tasks = ctx.n_tasks; size_t work_size = 0; // initialize tasks + work buffer @@ -16091,11 +16092,11 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); @@ -16103,24 +16104,24 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); @@ -16144,7 +16145,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ELU: case GGML_OP_RELU: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_MUL: case GGML_OP_GELU: @@ -16155,32 +16156,32 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; // TODO: use different scheduling for different matrix sizes //const int nr0 = ggml_nrows(node->src0); //const int nr1 = ggml_nrows(node->src1); - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); + //n_tasks[i] = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]); size_t cur = 0; const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning } else #elif defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); } @@ -16188,7 +16189,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if 
(ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning if (node->src0->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 @@ -16206,7 +16207,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_SCALE: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_SET: case GGML_OP_CONT: @@ -16219,7 +16220,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -16227,19 +16228,19 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + n_tasks[i] = 1; //TODO } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + n_tasks[i] = 1; //TODO } break; case GGML_OP_CONV_1D: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -16268,7 +16269,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_CONV_2D: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; GGML_ASSERT(node->src1->ne[3] == 1); @@ -16303,45 +16304,45 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - 
node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; @@ -16349,13 +16350,13 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16368,27 +16369,27 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i]; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_COUNT: { @@ -16402,35 +16403,31 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap work_size += CACHE_LINE_SIZE*(n_threads - 1); } - ctx->work_size = work_size; - ctx->work_data = NULL; - ctx->planned = true; + ctx.n_threads = n_threads; + ctx.work_size = work_size; + ctx.work_data = NULL; + + return ctx; } -void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { - if (ctx == NULL) { - ctx = alloca(sizeof(struct ggml_cgraph_context)); +void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) { + { GGML_ASSERT(ctx); - ctx->work_size = 0; - ctx->work_data = NULL; - ctx->planned = false; - } else { - // The work_size and work_data MAY have default values even if has been planned. 
+ GGML_ASSERT(ctx->n_threads > 0); + if (ctx->work_size > 0) { GGML_ASSERT(ctx->work_data); } - } - if (!ctx->planned) { - ggml_graph_compute_plan(ctx, cgraph); - if (ctx->work_size > 0) { - ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8)); - GGML_ASSERT(ctx->work_data); - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size); + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(ctx->n_tasks[i] > 0); + } } + } - const int n_threads = cgraph->n_threads; + const int n_threads = ctx->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, @@ -16494,12 +16491,6 @@ void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph } } -// Deprecated, keep it only for backward compatibility. -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - UNUSED(ctx); - ggml_graph_compute_v2(NULL, cgraph); -} - void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; @@ -16548,14 +16539,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17283,7 +17273,6 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // static enum ggml_opt_result ggml_opt_adam( - struct ggml_context * ctx, struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, @@ -17291,9 +17280,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -17340,7 +17326,18 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17420,7 +17417,18 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } const float fx = ggml_get_f32_1d(f, 0); @@ -17491,7 +17499,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17542,7 
+17549,18 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_opt_get_grad(np, ps, g); @@ -17610,9 +17628,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -17664,7 +17679,17 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_opt_get_grad(np, ps, g); @@ -17723,7 +17748,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18025,7 +18050,7 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { diff --git a/ggml.h b/ggml.h index f949fe35f6877..f92f428fa2090 100644 --- a/ggml.h +++ b/ggml.h @@ -65,7 +65,16 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// const int n_threads = 1; +// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads); +// if (ctx.work_size > 0) { +// ctx.work_data = malloc(ctx.work_size); +// GGML_ASSERT(ctx.work_data); +// } +// ggml_graph_compute(&ctx, &gf); +// if (ctx.work_data) { +// free(ctx.work_data); +// } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -418,9 +427,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -432,27 +438,30 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - // graph compute context - struct ggml_cgraph_context { - // After call to `ggml_graph_compute_plan()`, `planned` is set as true, - // `work_size` will be updated as non-zero when buffer is required. When - // need buffer, caller MUST allocate memory for `work_data`. - // See https://github.com/ggerganov/ggml/issues/287 + // The default graph compute plan that needs to be prepared for ggml_graph_compute(). + // Since https://github.com/ggerganov/ggml/issues/287 + struct ggml_graph_compute_plan { + // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; + // Worker buffer. + // Expect allocate/free by caller before/after calling to `ggml_graph_compute()`. void * work_data; - bool planned; // true means ready to compute graph nodes. 
+ + int n_threads; + + // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes. + int n_tasks[GGML_MAX_NODES]; }; // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1305,19 +1314,10 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - // Since https://github.com/ggerganov/ggml/issues/287 - GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); - // Since https://github.com/ggerganov/ggml/issues/287 - // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`. - // Another use case: allocate buffer explicitly: - // - call `ggml_graph_compute_plan()`; - // - allocate memory for `ctx->work_data`; - // - finally call `ggml_graph_compute_v2()`. - // NOTE: don't manually set `ctx->planned`. - GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); - // Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead. - // See https://github.com/ggerganov/ggml/issues/287 - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); + // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute(). + // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data. + GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index 02afdeb14078f..d1ae57298b3eb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1309,7 +1309,7 @@ static bool llama_eval_internal( // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1612,10 +1612,30 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } } #else - ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } #endif if (cgraph_fname) { @@ -2966,8 +2986,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3120,7 +3150,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3140,7 +3169,18 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_free(cpy_ctx); } @@ -3226,7 +3266,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3246,7 +3285,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_free(cpy_ctx); } diff --git a/tests/test-grad0.c b/tests/test-grad0.c index a3e25214b84eb..11bb2307f627e 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -215,15 +215,36 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - gf.n_threads = n_threads; struct ggml_cgraph gb = 
ggml_build_backward(ctx0, &gf, false); - gb.n_threads = n_threads; ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } + ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gb); + if (plan.work_data) { + free(plan.work_data); + } + } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -236,12 +257,34 @@ bool check_gradient( const float xm = x0 - eps; const float xp = x0 + eps; set_element(x[i], k, xp); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } const float f1 = ggml_get_f32_1d(f, 0); @@ -252,7 +295,18 @@ bool check_gradient( // compute gradient using backward graph ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gb); + if (plan.work_data) { + free(plan.work_data); + } + } const float g1 = get_element(x[i]->grad, k); diff --git a/tests/test-opt.c b/tests/test-opt.c index d001615ee353b..cb0d58199991a 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -140,7 +140,19 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &ge); + if (plan.work_data) { + free(plan.work_data); + } + } + const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -149,7 +161,19 @@ int main(int argc, const char ** argv) { ggml_opt(ctx, opt_params, e); ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &ge); + if (plan.work_data) { + free(plan.work_data); + } + } + const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); printf("%s: optimized e = %.4f\n", __func__, fe_opt); From 
a37de23953ed794e1f8b100156b31f909c245edb Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 16:22:52 +0800 Subject: [PATCH 03/20] minor: rename ctx as plan; const --- ggml.c | 50 +++++++++++++++++++++++++------------------------- ggml.h | 14 +++++++------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml.c b/ggml.c index f019774e39116..4968f36c25edb 100644 --- a/ggml.c +++ b/ggml.c @@ -15941,13 +15941,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; - struct ggml_graph_compute_plan * cgraph_ctx; + const struct ggml_cgraph * cgraph; + const struct ggml_graph_compute_plan * plan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -15971,10 +15971,10 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cgraph * cgraph = state->shared->cgraph; - struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx; - const int *n_tasks_arr = ctx->n_tasks; + const struct ggml_graph_compute_plan * plan = state->shared->plan; + const int *n_tasks_arr = plan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ ctx->work_size, - /*.wdata =*/ ctx->work_data, + /*.wsize =*/ plan->work_size, + /*.wdata =*/ plan->work_data, }; if (node_n != -1) { @@ -16059,8 +16059,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, - /*.wsize =*/ ctx->work_size, - /*.wdata =*/ ctx->work_data, + /*.wsize =*/ plan->work_size, + /*.wdata =*/ plan->work_data, }; if (state->ith < n_tasks) { @@ -16077,9 +16077,9 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * n_threads = GGML_DEFAULT_N_THREADS; } - struct ggml_graph_compute_plan ctx; - memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan)); - int * n_tasks = ctx.n_tasks; + struct ggml_graph_compute_plan plan; + memset(&plan, 0, sizeof(struct ggml_graph_compute_plan)); + int * n_tasks = plan.n_tasks; size_t work_size = 0; // initialize tasks + work buffer @@ -16403,35 +16403,35 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * work_size += CACHE_LINE_SIZE*(n_threads - 1); } - ctx.n_threads = n_threads; - ctx.work_size = work_size; - ctx.work_data = NULL; + plan.n_threads = n_threads; + plan.work_size = work_size; + plan.work_data = NULL; - return ctx; + return plan; } -void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) { +void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) { { - GGML_ASSERT(ctx); - GGML_ASSERT(ctx->n_threads > 0); + GGML_ASSERT(plan); + GGML_ASSERT(plan->n_threads > 0); - if (ctx->work_size > 0) { - GGML_ASSERT(ctx->work_data); + if (plan->work_size > 0) { + GGML_ASSERT(plan->work_data); } for (int i = 0; i < cgraph->n_nodes; ++i) { if (cgraph->nodes[i]->op != GGML_OP_NONE) { - GGML_ASSERT(ctx->n_tasks[i] > 0); + GGML_ASSERT(plan->n_tasks[i] > 0); } } } - const 
int n_threads = ctx->n_threads; + const int n_threads = plan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, - /*.cgraph_ctx =*/ ctx, + /*.cgraph_plan =*/ plan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, diff --git a/ggml.h b/ggml.h index f92f428fa2090..fae63e6312c41 100644 --- a/ggml.h +++ b/ggml.h @@ -66,14 +66,14 @@ // ggml_set_f32(b, 4.0f); // // const int n_threads = 1; -// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads); -// if (ctx.work_size > 0) { -// ctx.work_data = malloc(ctx.work_size); -// GGML_ASSERT(ctx.work_data); +// struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); +// if (plan.work_size > 0) { +// plan.work_data = malloc(plan.work_size); +// GGML_ASSERT(plan.work_data); // } -// ggml_graph_compute(&ctx, &gf); -// if (ctx.work_data) { -// free(ctx.work_data); +// ggml_graph_compute(&plan, &gf); +// if (plan.work_data) { +// free(plan.work_data); // } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); From db81f33ef2067d000576fbc24d3e32a5f5d1ba0e Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 18:10:00 +0800 Subject: [PATCH 04/20] remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward --- tests/test-grad0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 11bb2307f627e..477fedfeef63e 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -218,7 +218,6 @@ bool check_gradient( struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - ggml_graph_compute(ctx0, &gf); { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { From 2b502c32caa5f5f5a996bd8ba73ae8120767eb9b Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 20:28:07 +0800 Subject: [PATCH 05/20] add static ggml_graph_compute_sugar() --- ggml.c | 64 +++++++++++++++++----------------------------------------- 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/ggml.c b/ggml.c index 4968f36c25edb..0e906d0c3dde7 100644 --- a/ggml.c +++ b/ggml.c @@ -16424,7 +16424,6 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap GGML_ASSERT(plan->n_tasks[i] > 0); } } - } const int n_threads = plan->n_threads; @@ -16491,6 +16490,20 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } } +static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + + ggml_graph_compute(&plan, cgraph); + + if (plan.work_data) { + free(plan.work_data); + } +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; @@ -17327,17 +17340,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17418,17 +17421,7 @@ 
static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17550,17 +17543,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17679,17 +17662,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + + ggml_graph_compute_sugar(gb, params.n_threads); ggml_opt_get_grad(np, ps, g); From cb1dec0ec04228aa13137caebe951cd7843a6816 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 23:58:31 +0800 Subject: [PATCH 06/20] minor: update comments --- ggml.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml.h b/ggml.h index fae63e6312c41..0f1bd138bff29 100644 --- a/ggml.h +++ b/ggml.h @@ -448,8 +448,7 @@ extern "C" { struct ggml_graph_compute_plan { // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; - // Worker buffer. - // Expect allocate/free by caller before/after calling to `ggml_graph_compute()`. + // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. 
void * work_data; int n_threads; From b1331d7e604eeae9b9b0e4f7b3a50b70b49c1b44 Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 4 Jul 2023 20:38:46 +0800 Subject: [PATCH 07/20] reusable buffers --- examples/baby-llama/baby-llama.cpp | 23 +++---- examples/benchmark/benchmark-matmult.cpp | 29 ++++----- .../train-text-from-scratch.cpp | 29 ++++----- ggml.c | 3 +- ggml.h | 2 +- llama.cpp | 64 ++++++++----------- tests/test-grad0.c | 64 +++++++++++-------- tests/test-opt.c | 46 +++++++++---- 8 files changed, 126 insertions(+), 134 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index f147c23a205b5..785e7e8860fff 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + auto compute_plan_buffer = std::vector(); + for (int ex=0; ex 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e4f361e13fdec..e7d75c9ae51e0 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -164,16 +164,15 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); + auto compute_plan_buffer = std::vector(); + { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } TENSOR_DUMP(gf.nodes[0]); @@ -229,15 +228,12 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + 
compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf31); - if (plan.work_data) { - free(plan.work_data); - } } long long int stop = ggml_time_us(); @@ -272,15 +268,12 @@ int main(int argc, char ** argv) { // Running a different graph computation to make sure we override the CPU cache lines { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf32); - if (plan.work_data) { - free(plan.work_data); - } } } printf("\n"); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 83da31531da57..0345b8dc02748 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } + auto compute_plan_buffer = std::vector(); + printf("%s: begin training\n", __func__); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) { } { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, gf); - if (plan.work_data) { - free(plan.work_data); - } } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) { model.train_tokens += n_batch * n_tokens; { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/ggml.c b/ggml.c index 0e906d0c3dde7..94a71070612cc 100644 --- a/ggml.c +++ b/ggml.c @@ -15974,7 +15974,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_graph_compute_plan * plan = state->shared->plan; - const int *n_tasks_arr = plan->n_tasks; + 
const int * n_tasks_arr = plan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -16490,6 +16490,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } } +// TODO: avoid allocating memory frequently. static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); if (plan.work_size > 0) { diff --git a/ggml.h b/ggml.h index 0f1bd138bff29..1b50ab8666ed6 100644 --- a/ggml.h +++ b/ggml.h @@ -449,7 +449,7 @@ extern "C" { // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. - void * work_data; + uint8_t * work_data; int n_threads; diff --git a/llama.cpp b/llama.cpp index d1ae57298b3eb..c29d46d8dd596 100644 --- a/llama.cpp +++ b/llama.cpp @@ -321,6 +321,10 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_compute_plan.work_data` + // std::vector guarantees the elements are stored contiguously. + std::vector compute_plan_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -1591,10 +1595,13 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); + bool call_ggml_graph_compute = true; + #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); + call_ggml_graph_compute = false; } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1611,32 +1618,17 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } - } } -#else - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); +#endif + + if (call_ggml_graph_compute) { + auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + lctx.compute_plan_buffer.resize(plan.work_size); + plan.work_data = lctx.compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } -#endif if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); @@ -2822,6 +2814,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + auto compute_plan_buffer = std::vector(); + while (true) { int32_t n_dims; int32_t length; @@ -2988,15 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + 
plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } // we won't need these tensors again, reset the context to save memory @@ -3171,15 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + ctx->compute_plan_buffer.resize(plan.work_size); + plan.work_data = ctx->compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } ggml_free(cpy_ctx); @@ -3287,15 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + ctx->compute_plan_buffer.resize(plan.work_size); + plan.work_data = ctx->compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } ggml_free(cpy_ctx); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 477fedfeef63e..548547727efdc 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -191,6 +191,32 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } +struct compute_plan_buffer { + size_t size; + uint8_t * data; +}; + +static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { + if (size == 0) { + return NULL; + } + + GGML_ASSERT(buf); + + if (buf->size == 0) { + buf->data = malloc(size); + buf->size = size; + } else if (buf->size < size) { + buf->data = realloc(buf->data, size); + buf->size = size; + } else { + // skip shrinking. 
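The helper above grows the scratch buffer on demand and deliberately never shrinks it, so the many graph evaluations performed by the gradient checks can share a single allocation. A hedged sketch of how the call sites below use it, assuming `gb` and `n_threads` are in scope as in `check_gradient()`:

    struct compute_plan_buffer plan_buf = { /*.size =*/ 0, /*.data =*/ NULL };

    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
    plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
    ggml_graph_compute(&plan, &gb);

    // ... further evaluations reuse plan_buf ...

    if (plan_buf.data) {
        free(plan_buf.data);
    }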
+ } + + GGML_ASSERT(buf->data); + return buf->data; +} + bool check_gradient( const char * op_name, struct ggml_context * ctx0, @@ -218,6 +244,8 @@ bool check_gradient( struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { @@ -235,14 +263,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gb); - if (plan.work_data) { - free(plan.work_data); - } } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); @@ -259,14 +281,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } const float f0 = ggml_get_f32_1d(f, 0); @@ -275,14 +291,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } const float f1 = ggml_get_f32_1d(f, 0); @@ -297,14 +307,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gb); - if (plan.work_data) { - free(plan.work_data); - } } const float g1 = get_element(x[i]->grad, k); @@ -321,6 +325,10 @@ bool check_gradient( } } + if (plan_buf.data) { + free(plan_buf.data); + } + return true; } diff --git a/tests/test-opt.c b/tests/test-opt.c index cb0d58199991a..35d070dc7a095 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -114,6 +114,31 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } + +struct compute_plan_buffer { + size_t size; + uint8_t * data; +}; + +static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { + if (size == 0) { + return NULL; + } + + if (buf->size == 0) { + buf->data = malloc(size); + buf->size = size; + } else if (buf->size < size) { + buf->data = realloc(buf->data, size); + buf->size = size; + } else { + // skip shrinking. 
+ } + + GGML_ASSERT(buf->data); + return buf->data; +} + int main(int argc, const char ** argv) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, @@ -141,16 +166,11 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); + struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &ge); - if (plan.work_data) { - free(plan.work_data); - } } const float fe = ggml_get_f32_1d(e, 0); @@ -164,14 +184,12 @@ int main(int argc, const char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &ge); - if (plan.work_data) { - free(plan.work_data); - } + } + + if (plan_buf.data) { + free(plan_buf.data); } const float fe_opt = ggml_get_f32_1d(e, 0); From 53cfb4b9957a54f20f25089da40aa9718e41aad7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:23:08 +0300 Subject: [PATCH 08/20] ggml : more consistent naming + metal fixes --- examples/baby-llama/baby-llama.cpp | 32 ++++---- examples/benchmark/benchmark-matmult.cpp | 32 ++++---- examples/metal/metal.cpp | 3 +- .../train-text-from-scratch.cpp | 34 ++++---- ggml-metal.h | 6 +- ggml-metal.m | 11 ++- ggml.c | 77 ++++++++++--------- ggml.h | 24 +++--- llama.cpp | 54 ++++++------- tests/CMakeLists.txt | 4 +- tests/test-grad0.c | 65 ++++++++-------- tests/test-opt.c | 27 +++---- 12 files changed, 194 insertions(+), 175 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 785e7e8860fff..5d66089b1e22e 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; for (int ex=0; ex 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, e); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - 
compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e7d75c9ae51e0..840f4fe525cfb 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -164,15 +164,15 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; { - auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } TENSOR_DUMP(gf.nodes[0]); @@ -228,12 +228,12 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); { - auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads); + if (pf31.work_size > 0) { + work_buffer.resize(pf31.work_size); + pf31.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf31); + ggml_graph_compute(&gf31, &pf31); } long long int stop = ggml_time_us(); @@ -268,12 +268,12 @@ int main(int argc, char ** argv) { // Running a different graph computation to make sure we override the CPU cache lines { - auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads); + if (pf32.work_size > 0) { + work_buffer.resize(pf32.work_size); + pf32.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf32); + ggml_graph_compute(&gf32, &pf32); } } printf("\n"); diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index cdfe4bfe97865..7438defdefcdf 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -35,10 +35,9 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(); + auto * ctx_metal = ggml_metal_init(1); const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0345b8dc02748..11ffbe2e1e3a1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu 
bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); + // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; @@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; printf("%s: begin training\n", __func__); @@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) { } { - auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, gf); + ggml_graph_compute(gf, &pf); } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) { model.train_tokens += n_batch * n_tokens; { - auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, gf); + ggml_graph_compute(gf, &pf); } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); @@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); llama_free_model(lmodel); ggml_free(model.ctx); diff --git a/ggml-metal.h b/ggml-metal.h index b9e50ac745eb0..928f1705c381c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/ggml-metal.m b/ggml-metal.m index fd69c41fe357d..3f15f791f9f65 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -25,6 +25,8 @@ }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,11 +88,12 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) 
{ fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -354,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; diff --git a/ggml.c b/ggml.c index 94a71070612cc..23938fc5f27b2 100644 --- a/ggml.c +++ b/ggml.c @@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { const struct ggml_cgraph * cgraph; - const struct ggml_graph_compute_plan * plan; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; - const struct ggml_graph_compute_plan * plan = state->shared->plan; - const int * n_tasks_arr = plan->n_tasks; + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; - const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ plan->work_size, - /*.wdata =*/ plan->work_data, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { @@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, - /*.wsize =*/ plan->work_size, - /*.wdata =*/ plan->work_data, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (state->ith < n_tasks) { @@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } // Prepare for graph computing. 
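This patch renames the planning API across ggml.h and the call sites: `struct ggml_graph_compute_plan` becomes `struct ggml_cplan`, `ggml_graph_compute_make_plan()` becomes `ggml_graph_plan()`, and `ggml_graph_compute()` now takes the graph first and the plan second. A before/after sketch of a typical call site (work-buffer handling elided; `gf` and `n_threads` assumed in scope):

    // before this patch
    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
    ggml_graph_compute(&plan, &gf);

    // after this patch
    struct ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);
    ggml_graph_compute(&gf, &cplan);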
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } - struct ggml_graph_compute_plan plan; - memset(&plan, 0, sizeof(struct ggml_graph_compute_plan)); - int * n_tasks = plan.n_tasks; + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + int * n_tasks = cplan.n_tasks; + size_t work_size = 0; // initialize tasks + work buffer @@ -16403,34 +16406,34 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * work_size += CACHE_LINE_SIZE*(n_threads - 1); } - plan.n_threads = n_threads; - plan.work_size = work_size; - plan.work_data = NULL; + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; - return plan; + return cplan; } -void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) { +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { - GGML_ASSERT(plan); - GGML_ASSERT(plan->n_threads > 0); + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); - if (plan->work_size > 0) { - GGML_ASSERT(plan->work_data); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); } for (int i = 0; i < cgraph->n_nodes; ++i) { if (cgraph->nodes[i]->op != GGML_OP_NONE) { - GGML_ASSERT(plan->n_tasks[i] > 0); + GGML_ASSERT(cplan->n_tasks[i] > 0); } } } - const int n_threads = plan->n_threads; + const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ plan, + /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, @@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } // TODO: avoid allocating memory frequently. 
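The TODO above refers to this helper calling malloc()/free() on every invocation. The long-lived C++ call sites in this series sidestep that by keeping one reusable buffer and only growing it; a minimal sketch of the pattern, loosely mirroring llama.cpp's `work_buffer` (with `gf` and `n_threads` assumed in scope):

    std::vector<uint8_t> work_buffer; // lives across evaluations

    ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);
    if (cplan.work_size > 0) {
        work_buffer.resize(cplan.work_size);
        cplan.work_data = work_buffer.data();
    }
    ggml_graph_compute(&gf, &cplan);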
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); +// TODO: make part of public API - use different name and put warning that it makes allocations +static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + if (cplan.work_size > 0) { + cplan.work_data = malloc(cplan.work_size); + GGML_ASSERT(cplan.work_data); } - ggml_graph_compute(&plan, cgraph); + ggml_graph_compute(cgraph, &cplan); - if (plan.work_data) { - free(plan.work_data); + if (cplan.work_data) { + free(cplan.work_data); } } @@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params->n_threads); + ggml_graph_compute_helper(gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/ggml.h b/ggml.h index 1b50ab8666ed6..901c701ea866f 100644 --- a/ggml.h +++ b/ggml.h @@ -443,17 +443,15 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - // The default graph compute plan that needs to be prepared for ggml_graph_compute(). - // Since https://github.com/ggerganov/ggml/issues/287 - struct ggml_graph_compute_plan { - // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. - size_t work_size; - // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. - uint8_t * work_data; + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; - // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes. + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes int n_tasks[GGML_MAX_NODES]; }; @@ -1313,11 +1311,11 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute(). - // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data. 
- GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/); - GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index c29d46d8dd596..e68beb7c5b8b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -321,9 +321,8 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct ggml_graph_compute_plan.work_data` - // std::vector guarantees the elements are stored contiguously. - std::vector compute_plan_buffer; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; // memory buffers used to evaluate the model // TODO: move in llama_state @@ -1599,6 +1598,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); call_ggml_graph_compute = false; @@ -1622,12 +1622,12 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); - if (plan.work_size > 0) { - lctx.compute_plan_buffer.resize(plan.work_size); - plan.work_data = lctx.compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads); + if (pf.work_size > 0) { + lctx.work_buffer.resize(pf.work_size); + pf.work_data = lctx.work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } if (cgraph_fname) { @@ -2587,8 +2587,8 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; @@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const bool warned = false; int n_tensors = 0; - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; while (true) { int32_t n_dims; @@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); { - auto plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = 
work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } // we won't need these tensors again, reset the context to save memory @@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); { - auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - ctx->compute_plan_buffer.resize(plan.work_size); - plan.work_data = ctx->compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + ctx->work_buffer.resize(pf.work_size); + pf.work_data = ctx->work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } ggml_free(cpy_ctx); @@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); { - auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - ctx->compute_plan_buffer.resize(plan.work_size); - plan.work_data = ctx->compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + ctx->work_buffer.resize(pf.work_size); + pf.work_data = ctx->work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } ggml_free(cpy_ctx); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..dd989c5c041f7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW -# llama_add_test(test-opt.c) # SLOW +llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 548547727efdc..9c27e603e6bee 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,6 +10,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#pragma GCC diagnostic ignored "-Wdouble-promotion" + #define MAX_NARGS 3 #undef MIN @@ -49,7 +51,7 @@ float frand(void) { int irand(int n) { if (n == 0) return 0; - else return rand()%n; + return rand()%n; } void get_random_dims(int64_t * dims, int ndims) { @@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int( float get_element(const struct ggml_tensor * t, int idx) { if (t->type == GGML_TYPE_F32) { return ((float *)t->data)[idx]; - } else if (t->type == GGML_TYPE_I32) { + } + + if (t->type == GGML_TYPE_I32) { return ((int32_t *)t->data)[idx]; - } else { - assert(false); - return INFINITY; } + + assert(false); + return INFINITY; } void set_element(struct ggml_tensor * t, int idx, float value) { @@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } -struct compute_plan_buffer { +struct work_buffer { size_t size; uint8_t * data; }; -static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { +static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { if (size == 0) { return NULL; } @@ -241,20 +245,19 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; { - struct 
ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + if (pf.work_size > 0) { + pf.work_data = malloc(pf.work_size); + GGML_ASSERT(pf.work_data); } - ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); + ggml_graph_compute(&gf, &pf); + if (pf.work_data) { + free(pf.work_data); } } @@ -262,9 +265,9 @@ bool check_gradient( ggml_set_f32 (f->grad, 1.0f); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gb); + struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); @@ -280,9 +283,9 @@ bool check_gradient( set_element(x[i], k, xp); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gf); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float f0 = ggml_get_f32_1d(f, 0); @@ -290,9 +293,9 @@ bool check_gradient( set_element(x[i], k, xm); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gf); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float f1 = ggml_get_f32_1d(f, 0); @@ -306,15 +309,15 @@ bool check_gradient( ggml_set_f32 (f->grad, 1.0f); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gb); + struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float g1 = get_element(x[i]->grad, k); const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", @@ -325,8 +328,8 @@ bool check_gradient( } } - if (plan_buf.data) { - free(plan_buf.data); + if (buf.data) { + free(buf.data); } return true; diff --git a/tests/test-opt.c b/tests/test-opt.c index 35d070dc7a095..3ed246b3b3b65 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,6 +7,7 @@ #define MAX_NARGS 2 +#pragma GCC diagnostic ignored "-Wdouble-promotion" // // logging @@ -33,7 +34,7 @@ #define GGML_PRINT(...) 
printf(__VA_ARGS__) -float frand() { +float frand(void) { return (float)rand()/(float)RAND_MAX; } @@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) { } -struct compute_plan_buffer { +struct work_buffer { size_t size; uint8_t * data; }; -static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { +static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { if (size == 0) { return NULL; } @@ -139,7 +140,7 @@ static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t s return buf->data; } -int main(int argc, const char ** argv) { +int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, .mem_buffer = NULL, @@ -166,11 +167,11 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); - struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &ge); + struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); + pe.work_data = work_buffer_resize(&buf, pe.work_size); + ggml_graph_compute(&ge, &pe); } const float fe = ggml_get_f32_1d(e, 0); @@ -183,13 +184,13 @@ int main(int argc, const char ** argv) { ggml_graph_reset (&ge); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &ge); + struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); + pe.work_data = work_buffer_resize(&buf, pe.work_size); + ggml_graph_compute(&ge, &pe); } - if (plan_buf.data) { - free(plan_buf.data); + if (buf.data) { + free(buf.data); } const float fe_opt = ggml_get_f32_1d(e, 0); From 4646cc2cf16bdece6ba87b6444fe8b02e87f1c5b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:25:27 +0300 Subject: [PATCH 09/20] ggml : fix docs --- ggml.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml.h b/ggml.h index 901c701ea866f..78870147c6857 100644 --- a/ggml.h +++ b/ggml.h @@ -65,15 +65,17 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// const int n_threads = 1; -// struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); -// if (plan.work_size > 0) { -// plan.work_data = malloc(plan.work_size); -// GGML_ASSERT(plan.work_data); +// struct ggml_cplan pf = ggml_graph_compute_make_plan(&gf, n_threads); +// +// if (pf.work_size > 0) { +// pf.work_data = malloc(pf.work_size); +// GGML_ASSERT(pf.work_data); // } -// ggml_graph_compute(&plan, &gf); -// if (plan.work_data) { -// free(plan.work_data); +// +// ggml_graph_compute(&gf, &pf); +// +// if (pf.work_data) { +// free(pf.work_data); // } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); From 8e1f0b6865f11a52932278775b040a8c925e247e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:30:40 +0300 Subject: [PATCH 10/20] tests : disable grad / opt + minor naming changes --- llama.cpp | 9 +++++---- tests/CMakeLists.txt | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index e68beb7c5b8b4..5c9aea9de24fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1268,7 +1268,7 @@ static bool llama_eval_internal( const float * embd, const int n_tokens, const 
int n_past, - const int n_threads, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); @@ -1309,10 +1309,11 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1622,7 +1623,7 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads); + ggml_cplan pf = ggml_graph_plan(&gf, n_threads); if (pf.work_size > 0) { lctx.work_buffer.resize(pf.work_size); pf.work_data = lctx.work_buffer.data(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dd989c5c041f7..4171c126c7b7d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -llama_add_test(test-grad0.c) # SLOW -llama_add_test(test-opt.c) # SLOW +# llama_add_test(test-grad0.c) # SLOW +# llama_add_test(test-opt.c) # SLOW From 2392f7a9cd732032cf6662e7ce3bdef6115826b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:43:43 +0300 Subject: [PATCH 11/20] ggml : add ggml_graph_compute_with_ctx() - backwards compatible API - deduplicates a lot of copy-paste --- ggml.c | 32 ++++++++++----------- ggml.h | 6 +++- tests/test-grad0.c | 69 ++++------------------------------------------ tests/test-opt.c | 47 +++---------------------------- 4 files changed, 29 insertions(+), 125 deletions(-) diff --git a/ggml.c b/ggml.c index 23938fc5f27b2..f8eddd81695e8 100644 --- a/ggml.c +++ b/ggml.c @@ -16493,21 +16493,17 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) } } -// TODO: avoid allocating memory frequently. 
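The static helper being removed here is replaced just below by `ggml_graph_compute_with_ctx()`, which keeps the one-call convenience but carves the work buffer out of the supplied `ggml_context` (as a 1-D `GGML_TYPE_I8` tensor) instead of the heap, so the context must be created with enough spare memory to hold it. A hedged usage sketch, loosely following `tests/test-opt.c` later in this series:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024*1024, // must also leave room for the plan's work data
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ... build tensors and the graph `ge` in ctx ...

    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);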
-// TODO: make part of public API - use different name and put warning that it makes allocations -static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) { +// same as ggml_graph_compute() but the work data is allocated as a part of the context +// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - if (cplan.work_size > 0) { - cplan.work_data = malloc(cplan.work_size); - GGML_ASSERT(cplan.work_data); - } + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); - ggml_graph_compute(cgraph, &cplan); + cplan.work_data = buf->data; - if (cplan.work_data) { - free(cplan.work_data); - } + ggml_graph_compute(cgraph, &cplan); } void ggml_graph_reset(struct ggml_cgraph * cgraph) { @@ -17292,6 +17288,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // static enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, @@ -17346,7 +17343,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17427,7 +17424,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17498,6 +17495,7 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( + struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17549,7 +17547,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params->n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17669,7 +17667,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); @@ -17728,7 +17726,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18030,7 +18028,7 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { diff --git a/ggml.h b/ggml.h index 78870147c6857..906045c9e1de1 100644 --- a/ggml.h +++ b/ggml.h @@ -1306,7 +1306,7 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * 
tensor); @@ -1319,6 +1319,10 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 9c27e603e6bee..da4001ce5269f 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -195,32 +195,6 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } -struct work_buffer { - size_t size; - uint8_t * data; -}; - -static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { - if (size == 0) { - return NULL; - } - - GGML_ASSERT(buf); - - if (buf->size == 0) { - buf->data = malloc(size); - buf->size = size; - } else if (buf->size < size) { - buf->data = realloc(buf->data, size); - buf->size = size; - } else { - // skip shrinking. - } - - GGML_ASSERT(buf->data); - return buf->data; -} - bool check_gradient( const char * op_name, struct ggml_context * ctx0, @@ -247,28 +221,12 @@ bool check_gradient( struct ggml_cgraph gf = ggml_build_forward (f); struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - pf.work_data = malloc(pf.work_size); - GGML_ASSERT(pf.work_data); - } - ggml_graph_compute(&gf, &pf); - if (pf.work_data) { - free(pf.work_data); - } - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -282,24 +240,15 @@ bool check_gradient( const float xp = x0 + eps; set_element(x[i], k, xp); - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); set_element(x[i], k, x0); @@ -308,11 +257,7 @@ bool check_gradient( ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); const float g1 = get_element(x[i]->grad, k); @@ -328,10 +273,6 @@ bool check_gradient( } } - if (buf.data) { - free(buf.data); - } - return true; } diff --git a/tests/test-opt.c 
b/tests/test-opt.c index 3ed246b3b3b65..e928a7df7ee68 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -115,31 +115,6 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } - -struct work_buffer { - size_t size; - uint8_t * data; -}; - -static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { - if (size == 0) { - return NULL; - } - - if (buf->size == 0) { - buf->data = malloc(size); - buf->size = size; - } else if (buf->size < size) { - buf->data = realloc(buf->data, size); - buf->size = size; - } else { - // skip shrinking. - } - - GGML_ASSERT(buf->data); - return buf->data; -} - int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, @@ -163,16 +138,10 @@ int main(void) { struct ggml_tensor * d = ggml_sub(ctx, c, ab); struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); - struct ggml_cgraph ge = ggml_build_forward(e); - ggml_graph_reset (&ge); + ggml_graph_reset(&ge); - struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; - { - struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); - pe.work_data = work_buffer_resize(&buf, pe.work_size); - ggml_graph_compute(&ge, &pe); - } + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -181,17 +150,9 @@ int main(void) { ggml_opt(ctx, opt_params, e); - ggml_graph_reset (&ge); + ggml_graph_reset(&ge); - { - struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); - pe.work_data = work_buffer_resize(&buf, pe.work_size); - ggml_graph_compute(&ge, &pe); - } - - if (buf.data) { - free(buf.data); - } + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); From 1b9994f8098b3bb49e82672ccec40a704769d07f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:57:12 +0300 Subject: [PATCH 12/20] ci : enable test-grad0 --- .github/workflows/build.yml | 25 +++++++++++++++++++++---- tests/CMakeLists.txt | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12481e8be7cf7..547b03a7a7772 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,6 +41,10 @@ jobs: ubuntu-latest-cmake: runs-on: ubuntu-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + steps: - name: Clone id: checkout @@ -64,11 +68,15 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + continue-on-error: true strategy: @@ -99,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 macOS-latest-make: runs-on: macos-latest @@ -123,6 +131,10 @@ jobs: macOS-latest-cmake: runs-on: macos-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + steps: - name: Clone id: checkout @@ -147,10 +159,15 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 windows-latest-cmake: runs-on: windows-latest + + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 @@ -249,7 +266,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --verbose + ctest -C Release --verbose --timeout 900 - name: Get commit hash id: commit diff --git 
a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..1acf050a743e4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW From a67404e7497445b8f63c750ec6f285304b7b13ee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:08:25 +0300 Subject: [PATCH 13/20] examples : factor out plan allocation into a helper function --- examples/baby-llama/baby-llama.cpp | 44 +++++++------------ .../train-text-from-scratch.cpp | 39 ++++++---------- ggml.h | 13 +----- 3 files changed, 31 insertions(+), 65 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 5d66089b1e22e..4965881ecec22 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) { return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, @@ -1596,15 +1607,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); ggml_build_forward_expand(&gf, e); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1620,15 +1623,7 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1681,15 +1676,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); @@ -1711,10 +1698,11 @@ int main(int argc, char ** argv) { } print_matrix(model.tok_embeddings); - printf("done\n"); + // ggml_free(kv_self.ctx); // ggml_free(model_lora.ctx); ggml_free(model.ctx); + return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 
11ffbe2e1e3a1..b96fdcdc44b57 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -3246,14 +3257,7 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - { - ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(gf, &pf); - } + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3277,14 +3281,7 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - { - ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(gf, &pf); - } + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3371,15 +3368,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - - { - ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.h b/ggml.h index 906045c9e1de1..d6cde970de53b 100644 --- a/ggml.h +++ b/ggml.h @@ -65,18 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// struct ggml_cplan pf = ggml_graph_compute_make_plan(&gf, n_threads); -// -// if (pf.work_size > 0) { -// pf.work_data = malloc(pf.work_size); -// GGML_ASSERT(pf.work_data); -// } -// -// ggml_graph_compute(&gf, &pf); -// -// if (pf.work_data) { -// free(pf.work_data); -// } +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // From 2d3a5252f9f08616430a2c473ca7ffa784cca46b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:12:25 +0300 Subject: [PATCH 14/20] llama : factor out plan stuff into a helper function --- llama.cpp | 56 +++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5c9aea9de24fc..0aecbeedce92a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } +// +// ggml helpers +// + +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if 
(plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + static const std::map & MEM_REQ_SCRATCH0() { static std::map k_sizes = { @@ -761,7 +780,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -1623,12 +1641,7 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - lctx.work_buffer.resize(pf.work_size); - pf.work_data = lctx.work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } if (cgraph_fname) { @@ -2983,14 +2996,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); - { - ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3162,15 +3168,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - - { - ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - ctx->work_buffer.resize(pf.work_size); - pf.work_data = ctx->work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3275,15 +3273,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - - { - ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - ctx->work_buffer.resize(pf.work_size); - pf.work_data = ctx->work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } From 8fdf86dd253aef32a7e4acd97bde2150e0f3c40a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:15:17 +0300 Subject: [PATCH 15/20] ci : fix env --- .github/workflows/build.yml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 547b03a7a7772..a576139efd0ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,9 @@ on: paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GGML_NLOOP: 3 + GGML_NITER: 1 jobs: ubuntu-focal-make: @@ -41,10 +43,6 @@ jobs: ubuntu-latest-cmake: runs-on: ubuntu-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - steps: - name: Clone id: checkout @@ -73,10 +71,6 @@ jobs: ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - continue-on-error: true strategy: @@ -131,10 +125,6 @@ jobs: macOS-latest-cmake: runs-on: macos-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - steps: - name: Clone id: checkout @@ -164,10 +154,6 @@ jobs: windows-latest-cmake: runs-on: windows-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - env: OPENBLAS_VERSION: 
0.3.23 OPENCL_VERSION: 2023.04.17 From 9c9bdaf0b8e9e3d04c0caa83a7722a14b629e475 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:18:42 +0300 Subject: [PATCH 16/20] llama : fix duplicate symbols + refactor example benchmark --- examples/benchmark/benchmark-matmult.cpp | 38 +++++++++--------------- llama.cpp | 2 +- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 840f4fe525cfb..f7215f43bb31c 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -20,6 +20,17 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -166,14 +177,7 @@ int main(int argc, char ** argv) { std::vector work_buffer; - { - ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); TENSOR_DUMP(gf.nodes[0]); @@ -227,14 +231,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - { - ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads); - if (pf31.work_size > 0) { - work_buffer.resize(pf31.work_size); - pf31.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf31, &pf31); - } + ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -267,14 +264,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - { - ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads); - if (pf32.work_size > 0) { - work_buffer.resize(pf32.work_size); - pf32.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf32, &pf32); - } + ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/llama.cpp b/llama.cpp index 0aecbeedce92a..5221ab5a2dd27 100644 --- a/llama.cpp +++ b/llama.cpp @@ -83,7 +83,7 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default // ggml helpers // -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); if (plan.work_size > 0) { From 8dc7f104f82c81f51175050edc91c642d33b8927 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:28:10 +0300 Subject: [PATCH 17/20] ggml : remove obsolete assert + refactor n_tasks section --- ggml.c | 549 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 273 insertions(+), 276 deletions(-) diff --git a/ggml.c b/ggml.c index f8eddd81695e8..27232af28e0e0 100644 --- a/ggml.c +++ b/ggml.c @@ -10717,8 +10717,6 @@ static void ggml_compute_forward_mul_mat( float * dst_col = 
(float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } @@ -16078,328 +16076,327 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { n_threads = GGML_DEFAULT_N_THREADS; } + size_t work_size = 0; + struct ggml_cplan cplan; memset(&cplan, 0, sizeof(struct ggml_cplan)); - int * n_tasks = cplan.n_tasks; + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - size_t work_size = 0; + struct ggml_tensor * node = cgraph->nodes[i]; - // initialize tasks + work buffer - { - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - n_tasks[i] = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i]; - } + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - n_tasks[i] = n_threads; + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i]; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - n_tasks[i] = n_threads; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i]; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_TANH: - case GGML_OP_ELU: - case GGML_OP_RELU: - { - n_tasks[i] = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - n_tasks[i] = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - n_tasks[i] = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //n_tasks[i] = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]); - - size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + 
case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } + else #elif defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; - } + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + 
n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - n_tasks[i] = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - n_tasks[i] = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - n_tasks[i] = n_threads; - } break; - case GGML_OP_ALIBI: - { - n_tasks[i] = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - n_tasks[i] = 1; //TODO - } break; - case GGML_OP_CONV_1D: - { - n_tasks[i] = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C - const int64_t nk = ne00*ne01; + const int64_t nk = ne00*ne01; - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } + 
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 - } + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - if 
(node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - n_tasks[i] = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - n_tasks[i] = n_threads; - - size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - n_tasks[i] = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i]; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - n_tasks[i] = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; } + + cplan.n_tasks[i] = n_tasks; } if (work_size > 0) { From 551ed0823441537c323d8769e2c59ff32f403e2d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:35:22 +0300 Subject: [PATCH 18/20] ggml : fix indentation in switch --- ggml.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 27232af28e0e0..69b38dc70a46d 100644 --- a/ggml.c +++ b/ggml.c @@ -16176,31 +16176,29 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - } - else + } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else + if 
(ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; - } + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } work_size = MAX(work_size, cur); } break; From f789f2cef2e40ef9577b29688a55be350849ea99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:54:04 +0300 Subject: [PATCH 19/20] llama : avoid unnecessary bool --- llama.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5221ab5a2dd27..ee6ec0920fc9c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1613,14 +1613,11 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); - bool call_ggml_graph_compute = true; - #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); - call_ggml_graph_compute = false; } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1637,12 +1634,12 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - } -#endif - if (call_ggml_graph_compute) { ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } +#else + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); +#endif if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); From c15833c8d6fc6ad3a7239dc2febafca551e61f8a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 19:13:26 +0300 Subject: [PATCH 20/20] ggml : remove comments from source file and match order in header --- ggml.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 69b38dc70a46d..828368e671692 100644 --- a/ggml.c +++ b/ggml.c @@ -16070,7 +16070,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -// Prepare for graph computing. 
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; @@ -16488,8 +16487,16 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) } } -// same as ggml_graph_compute() but the work data is allocated as a part of the context -// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data +void ggml_graph_reset(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_set_zero(grad); + } + } +} + void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); @@ -16501,16 +16508,6 @@ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * ggml_graph_compute(cgraph, &cplan); } -void ggml_graph_reset(struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * grad = cgraph->grads[i]; - - if (grad) { - ggml_set_zero(grad); - } - } -} - struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i];
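Editor's note: for readers skimming the series, the sketch below summarizes how the reworked compute API is meant to be called once these patches are applied. It is a minimal, untested illustration assembled only from the signatures visible in the diffs above (ggml_graph_plan(), ggml_graph_compute(), ggml_graph_compute_with_ctx()); the tensor name `f`, the context name `ctx`, and the thread count are placeholders invented for the example.

    // build a graph as usual
    struct ggml_cgraph gf = ggml_build_forward(f);

    // 1) explicit plan: the caller owns the work buffer
    struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);
    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = malloc(plan.work_size);   // caller-provided scratch memory
        plan.work_data = work;
    }
    ggml_graph_compute(&gf, &plan);
    free(work);

    // 2) convenience wrapper: the work data is allocated inside the ggml_context,
    //    so the context must have been created with enough spare memory
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 4);

The first form is what the examples, tests, and llama.cpp now wrap in their local ggml_graph_compute_helper(); the second is the drop-in replacement for the old single-call ggml_graph_compute(ctx, &gf).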