ggml_graph_compute: deprecate using ggml_context, try resolve issue #287

ggerganov · Jun 26, 2023 · d9876af · d9876af
1 parent 181e8d9
commit d9876af
Show file tree

Hide file tree

Showing 3 changed files with 86 additions and 31 deletions.
diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp
@@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 
  gf->n_nodes = 0;
  gf->n_leafs = 0;
- gf->work_size = 0;
  gf->perf_runs = 0;
  gf->perf_cycles = 0;
  gf->perf_time_us = 0;
- gf->work = NULL;
 
  const auto & hparams = model->hparams;
  //const int n_ctx = hparams.n_ctx;

diff --git a/ggml.c b/ggml.c
@@ -16568,8 +16568,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
  /*.n_nodes =*/ 0,
  /*.n_leafs =*/ 0,
  /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
- /*.work_size =*/ 0,
- /*.work =*/ NULL,
  /*.nodes =*/ { NULL },
  /*.grads =*/ { NULL },
  /*.leafs =*/ { NULL },
@@ -16740,6 +16738,7 @@ void clear_numa_thread_affinity(void) {}
 
 struct ggml_compute_state_shared {
  struct ggml_cgraph * cgraph;
+ struct ggml_cgraph_context * cgraph_ctx;
 
  int64_t perf_node_start_cycles;
  int64_t perf_node_start_time_us;
@@ -16769,6 +16768,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static thread_ret_t ggml_graph_compute_thread(void * data) {
  struct ggml_compute_state * state = (struct ggml_compute_state *) data;
  struct ggml_cgraph * cgraph = state->shared->cgraph;
+ struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx;
 
  const int n_threads = state->shared->n_threads;
  set_numa_thread_affinity(state->ith, n_threads);
@@ -16783,8 +16783,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  /*.type =*/ GGML_TASK_FINALIZE,
  /*.ith =*/ 0,
  /*.nth =*/ 0,
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+ /*.wsize =*/ ctx->work_size,
+ /*.wdata =*/ ctx->work_data,
  };
 
  if (node_n != -1) {
@@ -16844,8 +16844,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  /*.type =*/ GGML_TASK_COMPUTE,
  /*.ith =*/ state->ith,
  /*.nth =*/ node->n_tasks,
- /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,
- /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,
+ /*.wsize =*/ ctx->work_size,
+ /*.wdata =*/ ctx->work_data,
  };
 
  if (state->ith < node->n_tasks) {
@@ -16856,23 +16856,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
  return 0;
 }
 
-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
- const int n_threads = cgraph->n_threads;
+// Prepare for graph computation.
+// Will set: node->n_tasks, ctx->{work_size, planned}; ctx->work_data is reset to NULL.
+void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+ GGML_ASSERT(ctx);
+ // Planning the same context twice is unnecessary; the asserts below require a fresh (unplanned) context.
+ GGML_ASSERT(ctx->work_size == 0);
+ GGML_ASSERT(ctx->work_data == NULL);
+ GGML_ASSERT(!ctx->planned);
 
- struct ggml_compute_state_shared state_shared = {
- /*.cgraph =*/ cgraph,
- /*.perf_node_start_cycles =*/ 0,
- /*.perf_node_start_time_us =*/ 0,
- /*.n_threads =*/ n_threads,
- /*.n_active =*/ n_threads,
- /*.node_n =*/ -1,
- };
- struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+ int n_threads = cgraph->n_threads;
+ size_t work_size = 0;
 
  // initialize tasks + work buffer
  {
- size_t work_size = 0;
-
  // thread scheduling for the different operations
  for (int i = 0; i < cgraph->n_nodes; i++) {
  struct ggml_tensor * node = cgraph->nodes[i];
@@ -17202,19 +17199,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  } break;
  }
  }
+ }
 
- if (cgraph->work != NULL && work_size > cgraph->work_size) {
- GGML_ASSERT(false); // TODO: better handling
- }
+ if (work_size > 0) {
+ work_size += CACHE_LINE_SIZE*(n_threads - 1);
+ }
+
+ ctx->work_size = work_size;
+ ctx->work_data = NULL;
+ ctx->planned = true;
+}
 
- if (work_size > 0 && cgraph->work == NULL) {
- cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);
+void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) {
+ if (ctx == NULL) {
+ ctx = alloca(sizeof(struct ggml_cgraph_context));
+ GGML_ASSERT(ctx);
+ ctx->work_size = 0;
+ ctx->work_data = NULL;
+ ctx->planned = false;
+ } else {
+ // `work_size` and `work_data` MAY still hold their default values (0 / NULL) even if the context has been planned.
+ if (ctx->work_size > 0) {
+ GGML_ASSERT(ctx->work_data);
+ }
+ }
 
- GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size);
- cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);
+ if (!ctx->planned) {
+ ggml_graph_compute_plan(ctx, cgraph);
+ if (ctx->work_size > 0) {
+ ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8));
+ GGML_ASSERT(ctx->work_data);
+ GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size);
  }
  }
 
+ const int n_threads = cgraph->n_threads;
+
+ struct ggml_compute_state_shared state_shared = {
+ /*.cgraph =*/ cgraph,
+ /*.cgraph_ctx =*/ ctx,
+ /*.perf_node_start_cycles =*/ 0,
+ /*.perf_node_start_time_us =*/ 0,
+ /*.n_threads =*/ n_threads,
+ /*.n_active =*/ n_threads,
+ /*.node_n =*/ -1,
+ };
+ struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads);
+
  // create thread pool
  if (n_threads > 1) {
  for (int j = 1; j < n_threads; ++j) {
@@ -17266,6 +17297,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
  }
 }
 
+// Deprecated, kept only for backward compatibility: `ctx` is ignored and a NULL compute context is passed to `ggml_graph_compute_v2()`.
+void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
+ UNUSED(ctx);
+ ggml_graph_compute_v2(NULL, cgraph);
+}
+
 void ggml_graph_reset(struct ggml_cgraph * cgraph) {
  for (int i = 0; i < cgraph->n_nodes; i++) {
  struct ggml_tensor * grad = cgraph->grads[i];

diff --git a/ggml.h b/ggml.h
@@ -409,15 +409,23 @@ extern "C" {
 
  static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 
+ // Graph compute context: work-buffer state for one graph computation, kept outside `struct ggml_cgraph`.
+ struct ggml_cgraph_context {
+ // `ggml_graph_compute_plan()` sets `planned` to true, resets `work_data` to NULL and stores
+ // the required buffer size in `work_size` (0 when no buffer is needed). If `work_size` is
+ // non-zero, the caller MUST point `work_data` at a buffer of that size before computing.
+ // See https://github.com/ggerganov/ggml/issues/287
+ size_t work_size; // size of the work buffer, in bytes
+ void * work_data; // work buffer passed to compute tasks as `wdata`
+ bool planned; // true means ready to compute graph nodes.
+ };
+
  // computation graph
  struct ggml_cgraph {
  int n_nodes;
  int n_leafs;
  int n_threads;
 
- size_t work_size;
- struct ggml_tensor * work;
-
  struct ggml_tensor * nodes[GGML_MAX_NODES];
  struct ggml_tensor * grads[GGML_MAX_NODES];
  struct ggml_tensor * leafs[GGML_MAX_NODES];
@@ -1270,6 +1278,18 @@ extern "C" {
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
 
+ // Since https://github.com/ggerganov/ggml/issues/287
+ GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+ // Since https://github.com/ggerganov/ggml/issues/287
+ // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`.
+ // Another use case: allocate buffer explicitly:
+ // - call `ggml_graph_compute_plan()`;
+ // - allocate memory for `ctx->work_data`;
+ // - finally call `ggml_graph_compute_v2()`.
+ // NOTE: don't manually set `ctx->planned`.
+ GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph);
+ // Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead.
+ // See https://github.com/ggerganov/ggml/issues/287
  GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
  GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);