diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp
index 50e14c4ac66b2a..212f54d32cbad2 100644
--- a/examples/baby-llama/baby-llama.cpp
+++ b/examples/baby-llama/baby-llama.cpp
@@ -566,8 +566,8 @@ struct ggml_tensor * forward(
         // wk   shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, 1]
         // Kcur shape [n_embd/n_head, n_head, N, 1]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, 0);

         // store key and value to memory
         {
@@ -823,8 +823,8 @@ struct ggml_tensor * forward_batch(
         // wk   shape [n_embd, n_embd, 1, 1]
         // Qcur shape [n_embd/n_head, n_head, N, n_batch]
         // Kcur shape [n_embd/n_head, n_head, N, n_batch]
-        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
-        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), n_past, n_rot, 0, 0);
         assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
         assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@@ -1116,7 +1116,7 @@ struct ggml_tensor * forward_lora(
                                 model->layers[il].wqb,
                                 cur)),
                         n_embd/n_head, n_head, N),
-                    n_past, n_rot, 0);
+                    n_past, n_rot, 0, 0);
         struct ggml_tensor * Kcur = ggml_rope(ctx0,
                 ggml_reshape_3d(ctx0,
                     ggml_mul_mat(ctx0,
@@ -1125,7 +1125,7 @@ struct ggml_tensor * forward_lora(
                                 model->layers[il].wkb,
                                 cur)),
                         n_embd/n_head, n_head, N),
-                    n_past, n_rot, 0);
+                    n_past, n_rot, 0, 0);

         // store key and value to memory
         {
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
index a05881d1640e7a..350d804289a62f 100644
--- a/examples/train-text-from-scratch/train-text-from-scratch.cpp
+++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;

     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;
diff --git a/ggml.c b/ggml.c
index 92faf03f746a1c..71f7ebd5be82d2 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16568,8 +16568,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
         /*.n_nodes      =*/ 0,
         /*.n_leafs      =*/ 0,
         /*.n_threads    =*/ GGML_DEFAULT_N_THREADS,
-        /*.work_size    =*/ 0,
-        /*.work         =*/ NULL,
         /*.nodes        =*/ { NULL },
         /*.grads        =*/ { NULL },
         /*.leafs        =*/ { NULL },
@@ -16740,6 +16738,7 @@ void clear_numa_thread_affinity(void) {}
 struct ggml_compute_state_shared {
     struct ggml_cgraph * cgraph;
+    struct ggml_cgraph_context * cgraph_ctx;

     int64_t perf_node_start_cycles;
     int64_t perf_node_start_time_us;

@@ -16769,6 +16768,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static thread_ret_t ggml_graph_compute_thread(void * data) {
     struct ggml_compute_state * state = (struct ggml_compute_state *) data;
     struct ggml_cgraph * cgraph = state->shared->cgraph;
+    struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;

     const int n_threads = state->shared->n_threads;
     set_numa_thread_affinity(state->ith, n_threads);
@@ -16783,8 +16783,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /*.type  =*/ GGML_TASK_FINALIZE,
                 /*.ith   =*/ 0,
                 /*.nth   =*/ 0,
-                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+                /*.wsize =*/ ctx->work_size,
+                /*.wdata =*/ ctx->work_data,
             };

             if (node_n != -1) {
@@ -16844,8 +16844,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
                 /*.type  =*/ GGML_TASK_COMPUTE,
                 /*.ith   =*/ state->ith,
                 /*.nth   =*/ node->n_tasks,
-                /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
-                /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+                /*.wsize =*/ ctx->work_size,
+                /*.wdata =*/ ctx->work_data,
             };

             if (state->ith < node->n_tasks) {
@@ -16856,23 +16856,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
     return 0;
 }

-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    const int n_threads = cgraph->n_threads;
+// Prepare for graph computing.
+// Will set: node->n_tasks, ctx->{work_size, planned}
+void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    GGML_ASSERT(ctx);
+    // This function is actually reentrant, but duplicate calls are unnecessary.
+    GGML_ASSERT(ctx->work_size == 0);
+    GGML_ASSERT(ctx->work_data == NULL);
+    GGML_ASSERT(!ctx->planned);

-    struct ggml_compute_state_shared state_shared = {
-        /*.cgraph                  =*/ cgraph,
-        /*.perf_node_start_cycles  =*/ 0,
-        /*.perf_node_start_time_us =*/ 0,
-        /*.n_threads               =*/ n_threads,
-        /*.n_active                =*/ n_threads,
-        /*.node_n                  =*/ -1,
-    };
-    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+    int n_threads = cgraph->n_threads;
+    size_t work_size = 0;

     // initialize tasks + work buffer
     {
-        size_t work_size = 0;
-
         // thread scheduling for the different operations
         for (int i = 0; i < cgraph->n_nodes; i++) {
             struct ggml_tensor * node = cgraph->nodes[i];
@@ -17202,19 +17199,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             }
         }
+    }

-        if (cgraph->work != NULL && work_size > cgraph->work_size) {
-            GGML_ASSERT(false); // TODO: better handling
-        }
+    if (work_size > 0) {
+        work_size += CACHE_LINE_SIZE*(n_threads - 1);
+    }
+
+    ctx->work_size = work_size;
+    ctx->work_data = NULL;
+    ctx->planned   = true;
+}

-        if (work_size > 0 && cgraph->work == NULL) {
-            cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
+void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+    if (ctx == NULL) {
+        ctx = alloca(sizeof(struct ggml_cgraph_context));
+        GGML_ASSERT(ctx);
+        ctx->work_size = 0;
+        ctx->work_data = NULL;
+        ctx->planned   = false;
+    } else {
+        // work_size and work_data MAY still hold their default values even if the context has been planned.
+        if (ctx->work_size > 0) {
+            GGML_ASSERT(ctx->work_data);
+        }
+    }

-            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
-            cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
+    if (!ctx->planned) {
+        ggml_graph_compute_plan(ctx, cgraph);
+        if (ctx->work_size > 0) {
+            ctx->work_data = malloc(ctx->work_size);
+            GGML_ASSERT(ctx->work_data);
+            GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, ctx->work_size);
         }
     }

+    const int n_threads = cgraph->n_threads;
+
+    struct ggml_compute_state_shared state_shared = {
+        /*.cgraph                  =*/ cgraph,
+        /*.cgraph_ctx              =*/ ctx,
+        /*.perf_node_start_cycles  =*/ 0,
+        /*.perf_node_start_time_us =*/ 0,
+        /*.n_threads               =*/ n_threads,
+        /*.n_active                =*/ n_threads,
+        /*.node_n                  =*/ -1,
+    };
+    struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
     // create thread pool
     if (n_threads > 1) {
         for (int j = 1; j < n_threads; ++j) {
@@ -17266,6 +17297,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     }
 }

+// Deprecated, kept only for backward compatibility.
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+    UNUSED(ctx);
+    ggml_graph_compute_v2(NULL, cgraph);
+}
+
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * grad = cgraph->grads[i];
diff --git a/ggml.h b/ggml.h
index 459913222e0833..b828ee867920e3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -409,15 +409,23 @@ extern "C" {

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // graph compute context
+    struct ggml_cgraph_context {
+        // After a call to `ggml_graph_compute_plan()`, `planned` is set to true and
+        // `work_size` is set to a non-zero value when a work buffer is required.
+        // In that case the caller MUST allocate memory for `work_data`.
+        // See https://github.com/ggerganov/ggml/issues/287
+        size_t work_size;
+        void * work_data;
+        bool   planned; // true means ready to compute graph nodes.
+    };
+
     // computation graph
     struct ggml_cgraph {
         int n_nodes;
         int n_leafs;
         int n_threads;

-        size_t work_size;
-        struct ggml_tensor * work;
-
         struct ggml_tensor * nodes[GGML_MAX_NODES];
         struct ggml_tensor * grads[GGML_MAX_NODES];
         struct ggml_tensor * leafs[GGML_MAX_NODES];
@@ -1270,6 +1278,18 @@ extern "C" {
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

+    // Since https://github.com/ggerganov/ggml/issues/287
+    GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Since https://github.com/ggerganov/ggml/issues/287
+    // When `ctx` is NULL, `ggml_graph_compute_v2()` computes `work_size` and allocates memory for `work_data` itself.
+    // To allocate the buffer explicitly instead:
+    // - call `ggml_graph_compute_plan()`;
+    // - allocate memory for `ctx->work_data`;
+    // - finally call `ggml_graph_compute_v2()`.
+    // NOTE: do not set `ctx->planned` manually.
+    GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+    // Deprecated: `ctx` is not required. Use `ggml_graph_compute_v2` instead.
+    // See https://github.com/ggerganov/ggml/issues/287
     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);
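
For reference, a minimal sketch (not part of this patch) of the explicit-allocation path described by the new ggml.h comments: plan the graph, allocate the caller-owned work buffer, then compute. The helper name compute_graph_with_plan and the surrounding setup are hypothetical; passing a NULL ctx to ggml_graph_compute_v2() would instead plan and allocate internally, as the deprecated ggml_graph_compute() wrapper does.

#include <stdlib.h>
#include "ggml.h"

// Hypothetical usage sketch of the two-step API proposed above.
static void compute_graph_with_plan(struct ggml_cgraph * gf) {
    struct ggml_cgraph_context cctx = {
        /*.work_size =*/ 0,
        /*.work_data =*/ NULL,
        /*.planned   =*/ false,
    };

    // 1. plan: fills cctx.work_size and sets cctx.planned
    ggml_graph_compute_plan(&cctx, gf);

    // 2. allocate the work buffer if the plan requires one (owned by the caller)
    if (cctx.work_size > 0) {
        cctx.work_data = malloc(cctx.work_size);
    }

    // 3. compute the graph nodes using the prepared context
    ggml_graph_compute_v2(&cctx, gf);

    free(cctx.work_data);
}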