From a1e7c6922898c013ac67e0fa517531d8367da841 Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 27 Jun 2023 05:47:08 +0800 Subject: [PATCH 01/20] ggml_graph_compute: deprecate using ggml_context, try resolve issue #287 --- .../train-text-from-scratch.cpp | 2 - ggml.c | 89 +++++++++++++------ ggml.h | 26 +++++- 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index c50eeb343bcef..7f7bf3b6fed53 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -1426,11 +1426,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train( gf->n_nodes = 0; gf->n_leafs = 0; - gf->work_size = 0; gf->perf_runs = 0; gf->perf_cycles = 0; gf->perf_time_us = 0; - gf->work = NULL; const auto & hparams = model->hparams; //const int n_ctx = hparams.n_ctx; diff --git a/ggml.c b/ggml.c index d257c3d657b34..0035066000af0 100644 --- a/ggml.c +++ b/ggml.c @@ -15773,8 +15773,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, - /*.work_size =*/ 0, - /*.work =*/ NULL, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15946,6 +15944,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; + struct ggml_cgraph_context * cgraph_ctx; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15975,6 +15974,7 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; + struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (node_n != -1) { @@ -16057,8 +16057,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + /*.wsize =*/ ctx->work_size, + /*.wdata =*/ ctx->work_data, }; if (state->ith < node->n_tasks) { @@ -16069,23 +16069,20 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - const int n_threads = cgraph->n_threads; +// Prepare for graph computing. +// Will set: node->n_tasks, ctx->{work_size, planned} +void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + GGML_ASSERT(ctx); + // This function is actually reentrant, but duplicate calls is unnecessary. 
+ GGML_ASSERT(ctx->work_size == 0); + GGML_ASSERT(ctx->work_data == NULL); + GGML_ASSERT(!ctx->planned); - struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, - /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, - }; - struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + int n_threads = cgraph->n_threads; + size_t work_size = 0; // initialize tasks + work buffer { - size_t work_size = 0; - // thread scheduling for the different operations for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; @@ -16399,19 +16396,53 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; } } + } - if (cgraph->work != NULL && work_size > cgraph->work_size) { - GGML_ASSERT(false); // TODO: better handling - } + if (work_size > 0) { + work_size += CACHE_LINE_SIZE*(n_threads - 1); + } + + ctx->work_size = work_size; + ctx->work_data = NULL; + ctx->planned = true; +} - if (work_size > 0 && cgraph->work == NULL) { - cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1); +void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { + if (ctx == NULL) { + ctx = alloca(sizeof(struct ggml_cgraph_context)); + GGML_ASSERT(ctx); + ctx->work_size = 0; + ctx->work_data = NULL; + ctx->planned = false; + } else { + // The work_size and work_data MAY have default values even if has been planned. + if (ctx->work_size > 0) { + GGML_ASSERT(ctx->work_data); + } + } - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, cgraph->work_size); - cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); + if (!ctx->planned) { + ggml_graph_compute_plan(ctx, cgraph); + if (ctx->work_size > 0) { + ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8)); + GGML_ASSERT(ctx->work_data); + GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size); } } + const int n_threads = cgraph->n_threads; + + struct ggml_compute_state_shared state_shared = { + /*.cgraph =*/ cgraph, + /*.cgraph_ctx =*/ ctx, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, + }; + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16463,6 +16494,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } } +// Deprecated, keep it only for backward compatibility. +void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { + UNUSED(ctx); + ggml_graph_compute_v2(NULL, cgraph); +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; diff --git a/ggml.h b/ggml.h index 24ca8ae221c75..f949fe35f6877 100644 --- a/ggml.h +++ b/ggml.h @@ -437,15 +437,23 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); + // graph compute context + struct ggml_cgraph_context { + // After call to `ggml_graph_compute_plan()`, `planned` is set as true, + // `work_size` will be updated as non-zero when buffer is required. When + // need buffer, caller MUST allocate memory for `work_data`. 
+ // See https://github.com/ggerganov/ggml/issues/287 + size_t work_size; + void * work_data; + bool planned; // true means ready to compute graph nodes. + }; + // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; int n_threads; - size_t work_size; - struct ggml_tensor * work; - struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; struct ggml_tensor * leafs[GGML_MAX_NODES]; @@ -1297,6 +1305,18 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); + // Since https://github.com/ggerganov/ggml/issues/287 + GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Since https://github.com/ggerganov/ggml/issues/287 + // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`. + // Another use case: allocate buffer explicitly: + // - call `ggml_graph_compute_plan()`; + // - allocate memory for `ctx->work_data`; + // - finally call `ggml_graph_compute_v2()`. + // NOTE: don't manually set `ctx->planned`. + GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); + // Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead. + // See https://github.com/ggerganov/ggml/issues/287 GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); From b11ac01f6b3985a8f41d9a99db076982a61bfec0 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 16:00:47 +0800 Subject: [PATCH 02/20] rewrite: no longer consider backward compitability; plan and make_plan --- examples/baby-llama/baby-llama.cpp | 41 +++- examples/benchmark/benchmark-matmult.cpp | 46 +++- .../train-text-from-scratch.cpp | 41 +++- ggml.c | 229 ++++++++++-------- ggml.h | 52 ++-- llama.cpp | 68 +++++- tests/test-grad0.c | 66 ++++- tests/test-opt.c | 28 ++- 8 files changed, 405 insertions(+), 166 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 212f54d32cbad..f147c23a205b5 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1586,7 +1586,6 @@ int main(int argc, char ** argv) { int n_past = 0; ggml_cgraph gf = {}; - gf.n_threads = 1; get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets); @@ -1595,7 +1594,18 @@ int main(int argc, char ** argv) { struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1611,7 +1621,18 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_after_opt = ggml_get_f32_1d(e, 
0); @@ -1659,13 +1680,23 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(params); ggml_cgraph gf = {}; - gf.n_threads = 1; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 39d15caeb7779..e4f361e13fdec 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -159,13 +159,22 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf = ggml_build_forward(m11xm2); - gf.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); TENSOR_DUMP(m11); TENSOR_DUMP(m2); - ggml_graph_compute(ctx, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } TENSOR_DUMP(gf.nodes[0]); @@ -187,7 +196,6 @@ int main(int argc, char ** argv) { // printf("Creating compute graph\n"); struct ggml_cgraph gf31 = ggml_build_forward(q31); - gf31.n_threads=benchmark_params.n_threads; // Set up a second graph computation to make sure we override the CPU cache lines // printf("Creating new tensor q12 & Running quantize\n"); @@ -199,8 +207,7 @@ int main(int argc, char ** argv) { //printf("Creating compute graph\n"); struct ggml_cgraph gf32 = ggml_build_forward(q32); - gf32.n_threads=benchmark_params.n_threads; - printf("cgraph->n_threads=%i\n",gf31.n_threads); + printf("n_threads=%i\n", benchmark_params.n_threads); const int dimx = sizex; const int dimy = sizey; @@ -221,14 +228,25 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - ggml_graph_compute(ctx, &gf31); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf31); + if (plan.work_data) { + free(plan.work_data); + } + } + long long int stop = ggml_time_us(); long long int usec = stop-start; double gflops = (double)(flops_per_matrix)/usec/1000.0; gflops_sum += gflops; printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n", i, - gf31.n_threads, + benchmark_params.n_threads, sizex, sizey, sizez, flops_per_matrix, usec,gflops); @@ -253,7 +271,17 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - ggml_graph_compute(ctx, &gf32); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); + if (plan.work_size > 0) { + plan.work_data = 
malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf32); + if (plan.work_data) { + free(plan.work_data); + } + } } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7f7bf3b6fed53..83da31531da57 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3215,9 +3215,6 @@ int main(int argc, char ** argv) { struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data; struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data; - // ggml_cgraph gf = {}; - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs); @@ -3246,7 +3243,17 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - ggml_graph_compute(ctx0, gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gf); + if (plan.work_data) { + free(plan.work_data); + } + } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3270,7 +3277,17 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - ggml_graph_compute(ctx0, gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gf); + if (plan.work_data) { + free(plan.work_data); + } + } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3352,13 +3369,23 @@ int main(int argc, char ** argv) { struct ggml_context * ctx0 = ggml_init(cparams); ggml_cgraph gf = {}; - gf.n_threads = params.n_threads; int n_past = 0; struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.c b/ggml.c index 0035066000af0..f019774e39116 100644 --- a/ggml.c +++ b/ggml.c @@ -4583,14 +4583,13 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.src0 =*/ NULL, /*.src1 =*/ NULL, /*.opt =*/ { NULL }, - /*.n_tasks =*/ 0, /*.perf_runs =*/ 0, /*.perf_cycles =*/ 0, /*.perf_time_us =*/ 0, /*.data =*/ (data == NULL && !ctx->no_alloc) ? 
(void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - /*.pad =*/ { 0 }, + /*.padding =*/ { 0 }, }; // TODO: this should not be needed as long as we don't rely on aligned SIMD loads @@ -15772,7 +15771,6 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) { struct ggml_cgraph result = { /*.n_nodes =*/ 0, /*.n_leafs =*/ 0, - /*.n_threads =*/ GGML_DEFAULT_N_THREADS, /*.nodes =*/ { NULL }, /*.grads =*/ { NULL }, /*.leafs =*/ { NULL }, @@ -15944,7 +15942,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; - struct ggml_cgraph_context * cgraph_ctx; + struct ggml_graph_compute_plan * cgraph_ctx; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15974,7 +15972,9 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; - struct ggml_cgraph_context * ctx = state->shared->cgraph_ctx; + + struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx; + const int *n_tasks_arr = ctx->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15997,7 +15997,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; if (GGML_OP_HAS_FINALIZE[node->op]) { - params.nth = node->n_tasks; + params.nth = n_tasks_arr[node_n]; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } @@ -16008,11 +16008,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); - params.nth = node->n_tasks; + params.nth = n_tasks; /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { @@ -16020,7 +16021,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); } - if (node->n_tasks == 1) { + if (n_tasks == 1) { // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; @@ -16052,16 +16053,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = n_tasks_arr[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, - /*.nth =*/ node->n_tasks, + /*.nth =*/ n_tasks, /*.wsize =*/ ctx->work_size, /*.wdata =*/ ctx->work_data, }; - if (state->ith < node->n_tasks) { + if (state->ith < n_tasks) { ggml_compute_forward(¶ms, node); } } @@ -16070,15 +16072,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } // Prepare for graph computing. -// Will set: node->n_tasks, ctx->{work_size, planned} -void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { - GGML_ASSERT(ctx); - // This function is actually reentrant, but duplicate calls is unnecessary. 
- GGML_ASSERT(ctx->work_size == 0); - GGML_ASSERT(ctx->work_data == NULL); - GGML_ASSERT(!ctx->planned); - - int n_threads = cgraph->n_threads; +struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) { + if (n_threads <= 0) { + n_threads = GGML_DEFAULT_N_THREADS; + } + + struct ggml_graph_compute_plan ctx; + memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan)); + int * n_tasks = ctx.n_tasks; size_t work_size = 0; // initialize tasks + work buffer @@ -16091,11 +16092,11 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_CPY: case GGML_OP_DUP: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); @@ -16103,24 +16104,24 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ADD: case GGML_OP_ADD1: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); } break; case GGML_OP_ACC: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_threads; + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i]; } work_size = MAX(work_size, cur); @@ -16144,7 +16145,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ELU: case GGML_OP_RELU: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_MUL: case GGML_OP_GELU: @@ -16155,32 +16156,32 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; } break; case GGML_OP_MUL_MAT: case GGML_OP_OUT_PROD: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; // TODO: use different scheduling for different matrix sizes //const int nr0 = ggml_nrows(node->src0); //const int nr1 = ggml_nrows(node->src1); - //node->n_tasks = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, node->n_tasks); + //n_tasks[i] = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]); size_t cur = 0; const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning } else #elif defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); } @@ -16188,7 +16189,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if 
(ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - node->n_tasks = 1; // TODO: this actually is doing nothing + n_tasks[i] = 1; // TODO: this actually is doing nothing // the threads are still spinning if (node->src0->type != GGML_TYPE_F32) { // here we need memory just for single 2D matrix from src0 @@ -16206,7 +16207,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_SCALE: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_SET: case GGML_OP_CONT: @@ -16219,7 +16220,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: @@ -16227,19 +16228,19 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; } break; case GGML_OP_ALIBI: { - node->n_tasks = 1; //TODO + n_tasks[i] = 1; //TODO } break; case GGML_OP_CLAMP: { - node->n_tasks = 1; //TODO + n_tasks[i] = 1; //TODO } break; case GGML_OP_CONV_1D: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -16268,7 +16269,7 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_CONV_2D: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; GGML_ASSERT(node->src1->ne[3] == 1); @@ -16303,45 +16304,45 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap } break; case GGML_OP_FLASH_ATTN: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_FF: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN_BACK: { - 
node->n_tasks = n_threads; + n_tasks[i] = n_threads; size_t cur = 0; @@ -16349,13 +16350,13 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 } if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*node->n_tasks; // TODO: this can become (n_tasks-1) - cur += sizeof(float)*mxDn*node->n_tasks; // this is overestimated by x2 + cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) + cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 } work_size = MAX(work_size, cur); @@ -16368,27 +16369,27 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap case GGML_OP_MAP_CUSTOM2: case GGML_OP_MAP_CUSTOM3: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_CROSS_ENTROPY_LOSS: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; - size_t cur = ggml_type_size(node->type)*(node->n_tasks + node->src0->ne[0]*node->n_tasks); + size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]); work_size = MAX(work_size, cur); } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { - node->n_tasks = n_threads; + n_tasks[i] = n_threads; - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*node->n_tasks; + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i]; work_size = MAX(work_size, cur); } break; case GGML_OP_NONE: { - node->n_tasks = 1; + n_tasks[i] = 1; } break; case GGML_OP_COUNT: { @@ -16402,35 +16403,31 @@ void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgrap work_size += CACHE_LINE_SIZE*(n_threads - 1); } - ctx->work_size = work_size; - ctx->work_data = NULL; - ctx->planned = true; + ctx.n_threads = n_threads; + ctx.work_size = work_size; + ctx.work_data = NULL; + + return ctx; } -void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph) { - if (ctx == NULL) { - ctx = alloca(sizeof(struct ggml_cgraph_context)); +void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) { + { GGML_ASSERT(ctx); - ctx->work_size = 0; - ctx->work_data = NULL; - ctx->planned = false; - } else { - // The work_size and work_data MAY have default values even if has been planned. 
+ GGML_ASSERT(ctx->n_threads > 0); + if (ctx->work_size > 0) { GGML_ASSERT(ctx->work_data); } - } - if (!ctx->planned) { - ggml_graph_compute_plan(ctx, cgraph); - if (ctx->work_size > 0) { - ctx->work_data = malloc(ctx->work_size * sizeof(GGML_TYPE_I8)); - GGML_ASSERT(ctx->work_data); - GGML_PRINT_DEBUG("%s: allocating work buffer for graph (%zu bytes)\n", __func__, work_size); + for (int i = 0; i < cgraph->n_nodes; ++i) { + if (cgraph->nodes[i]->op != GGML_OP_NONE) { + GGML_ASSERT(ctx->n_tasks[i] > 0); + } } + } - const int n_threads = cgraph->n_threads; + const int n_threads = ctx->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, @@ -16494,12 +16491,6 @@ void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph } } -// Deprecated, keep it only for backward compatibility. -void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { - UNUSED(ctx); - ggml_graph_compute_v2(NULL, cgraph); -} - void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; @@ -16548,14 +16539,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char const int64_t * ne = tensor->ne; const size_t * nb = tensor->nb; - fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %8d %16p %32s\n", + fprintf(fout, "%-6s %-6s %-12s %8d %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 " %16zu %16zu %16zu %16zu %16p %32s\n", arg, ggml_type_name(tensor->type), ggml_op_name (tensor->op), tensor->n_dims, ne[0], ne[1], ne[2], ne[3], nb[0], nb[1], nb[2], nb[3], - tensor->n_tasks, tensor->data, tensor->name); } @@ -17283,7 +17273,6 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // static enum ggml_opt_result ggml_opt_adam( - struct ggml_context * ctx, struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, @@ -17291,9 +17280,6 @@ static enum ggml_opt_result ggml_opt_adam( struct ggml_cgraph * gb) { GGML_ASSERT(ggml_is_scalar(f)); - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - // these will store the parameters we want to optimize struct ggml_tensor * ps[GGML_MAX_PARAMS]; @@ -17340,7 +17326,18 @@ static enum ggml_opt_result ggml_opt_adam( // compute the function value ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17420,7 +17417,18 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } const float fx = ggml_get_f32_1d(f, 0); @@ -17491,7 +17499,6 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( - struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17542,7 
+17549,18 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_opt_get_grad(np, ps, g); @@ -17610,9 +17628,6 @@ static enum ggml_opt_result ggml_opt_lbfgs( } } - gf->n_threads = params.n_threads; - gb->n_threads = params.n_threads; - const int m = params.lbfgs.m; // these will store the parameters we want to optimize @@ -17664,7 +17679,17 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx, gb); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, gb); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_opt_get_grad(np, ps, g); @@ -17723,7 +17748,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18025,7 +18050,7 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); + result = ggml_opt_adam(opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { diff --git a/ggml.h b/ggml.h index f949fe35f6877..f92f428fa2090 100644 --- a/ggml.h +++ b/ggml.h @@ -65,7 +65,16 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// ggml_graph_compute(ctx0, &gf); +// const int n_threads = 1; +// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads); +// if (ctx.work_size > 0) { +// ctx.work_data = malloc(ctx.work_size); +// GGML_ASSERT(ctx.work_data); +// } +// ggml_graph_compute(&ctx, &gf); +// if (ctx.work_data) { +// free(ctx.work_data); +// } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // @@ -418,9 +427,6 @@ extern "C" { struct ggml_tensor * src1; struct ggml_tensor * opt[GGML_MAX_OPT]; - // thread scheduling - int n_tasks; - // performance int perf_runs; int64_t perf_cycles; @@ -432,27 +438,30 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - // graph compute context - struct ggml_cgraph_context { - // After call to `ggml_graph_compute_plan()`, `planned` is set as true, - // `work_size` will be updated as non-zero when buffer is required. When - // need buffer, caller MUST allocate memory for `work_data`. - // See https://github.com/ggerganov/ggml/issues/287 + // The default graph compute plan that needs to be prepared for ggml_graph_compute(). + // Since https://github.com/ggerganov/ggml/issues/287 + struct ggml_graph_compute_plan { + // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; + // Worker buffer. + // Expect allocate/free by caller before/after calling to `ggml_graph_compute()`. void * work_data; - bool planned; // true means ready to compute graph nodes. 
+ + int n_threads; + + // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes. + int n_tasks[GGML_MAX_NODES]; }; // computation graph struct ggml_cgraph { int n_nodes; int n_leafs; - int n_threads; struct ggml_tensor * nodes[GGML_MAX_NODES]; struct ggml_tensor * grads[GGML_MAX_NODES]; @@ -1305,19 +1314,10 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - // Since https://github.com/ggerganov/ggml/issues/287 - GGML_API void ggml_graph_compute_plan(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); - // Since https://github.com/ggerganov/ggml/issues/287 - // When `ctx` is NULL, `ggml_graph_compute_v2()` calculates work_size and allocates memory for `work_data`. - // Another use case: allocate buffer explicitly: - // - call `ggml_graph_compute_plan()`; - // - allocate memory for `ctx->work_data`; - // - finally call `ggml_graph_compute_v2()`. - // NOTE: don't manually set `ctx->planned`. - GGML_API void ggml_graph_compute_v2(struct ggml_cgraph_context * ctx, struct ggml_cgraph * cgraph); - // Deprecated, `ctx` is not required. Use `ggml_graph_compute_v2` instead. - // See https://github.com/ggerganov/ggml/issues/287 - GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); + // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute(). + // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data. + GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index 02afdeb14078f..d1ae57298b3eb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1309,7 +1309,7 @@ static bool llama_eval_internal( // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance ggml_cgraph gf = {}; - gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 
1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1612,10 +1612,30 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } } #else - ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } #endif if (cgraph_fname) { @@ -2966,8 +2986,18 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const } struct ggml_cgraph gf = ggml_build_forward(r); - gf.n_threads = n_threads; - ggml_graph_compute(lora_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3120,7 +3150,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kout3d->data = out; @@ -3140,7 +3169,18 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - ggml_graph_compute(cpy_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_free(cpy_ctx); } @@ -3226,7 +3266,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; - gf.n_threads = 1; ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer); kin3d->data = (void *) inp; @@ -3246,7 +3285,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - ggml_graph_compute(cpy_ctx, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } ggml_free(cpy_ctx); } diff --git a/tests/test-grad0.c b/tests/test-grad0.c index a3e25214b84eb..11bb2307f627e 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -215,15 +215,36 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - gf.n_threads = n_threads; struct ggml_cgraph gb = 
ggml_build_backward(ctx0, &gf, false); - gb.n_threads = n_threads; ggml_graph_compute(ctx0, &gf); + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } + ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gb); + if (plan.work_data) { + free(plan.work_data); + } + } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -236,12 +257,34 @@ bool check_gradient( const float xm = x0 - eps; const float xp = x0 + eps; set_element(x[i], k, xp); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - ggml_graph_compute(ctx0, &gf); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gf); + if (plan.work_data) { + free(plan.work_data); + } + } const float f1 = ggml_get_f32_1d(f, 0); @@ -252,7 +295,18 @@ bool check_gradient( // compute gradient using backward graph ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute(ctx0, &gb); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &gb); + if (plan.work_data) { + free(plan.work_data); + } + } const float g1 = get_element(x[i]->grad, k); diff --git a/tests/test-opt.c b/tests/test-opt.c index d001615ee353b..cb0d58199991a 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -140,7 +140,19 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &ge); + if (plan.work_data) { + free(plan.work_data); + } + } + const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -149,7 +161,19 @@ int main(int argc, const char ** argv) { ggml_opt(ctx, opt_params, e); ggml_graph_reset (&ge); - ggml_graph_compute(ctx, &ge); + + { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + ggml_graph_compute(&plan, &ge); + if (plan.work_data) { + free(plan.work_data); + } + } + const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); printf("%s: optimized e = %.4f\n", __func__, fe_opt); From 
a37de23953ed794e1f8b100156b31f909c245edb Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 16:22:52 +0800 Subject: [PATCH 03/20] minor: rename ctx as plan; const --- ggml.c | 50 +++++++++++++++++++++++++------------------------- ggml.h | 14 +++++++------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml.c b/ggml.c index f019774e39116..4968f36c25edb 100644 --- a/ggml.c +++ b/ggml.c @@ -15941,13 +15941,13 @@ void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - struct ggml_cgraph * cgraph; - struct ggml_graph_compute_plan * cgraph_ctx; + const struct ggml_cgraph * cgraph; + const struct ggml_graph_compute_plan * plan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; - int n_threads; + const int n_threads; // synchronization primitives atomic_int n_active; // num active threads @@ -15971,10 +15971,10 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cgraph * cgraph = state->shared->cgraph; - struct ggml_graph_compute_plan * ctx = state->shared->cgraph_ctx; - const int *n_tasks_arr = ctx->n_tasks; + const struct ggml_graph_compute_plan * plan = state->shared->plan; + const int *n_tasks_arr = plan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -15989,8 +15989,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ ctx->work_size, - /*.wdata =*/ ctx->work_data, + /*.wsize =*/ plan->work_size, + /*.wdata =*/ plan->work_data, }; if (node_n != -1) { @@ -16059,8 +16059,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, - /*.wsize =*/ ctx->work_size, - /*.wdata =*/ ctx->work_data, + /*.wsize =*/ plan->work_size, + /*.wdata =*/ plan->work_data, }; if (state->ith < n_tasks) { @@ -16077,9 +16077,9 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * n_threads = GGML_DEFAULT_N_THREADS; } - struct ggml_graph_compute_plan ctx; - memset(&ctx, 0, sizeof(struct ggml_graph_compute_plan)); - int * n_tasks = ctx.n_tasks; + struct ggml_graph_compute_plan plan; + memset(&plan, 0, sizeof(struct ggml_graph_compute_plan)); + int * n_tasks = plan.n_tasks; size_t work_size = 0; // initialize tasks + work buffer @@ -16403,35 +16403,35 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * work_size += CACHE_LINE_SIZE*(n_threads - 1); } - ctx.n_threads = n_threads; - ctx.work_size = work_size; - ctx.work_data = NULL; + plan.n_threads = n_threads; + plan.work_size = work_size; + plan.work_data = NULL; - return ctx; + return plan; } -void ggml_graph_compute(struct ggml_graph_compute_plan * ctx, struct ggml_cgraph * cgraph) { +void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) { { - GGML_ASSERT(ctx); - GGML_ASSERT(ctx->n_threads > 0); + GGML_ASSERT(plan); + GGML_ASSERT(plan->n_threads > 0); - if (ctx->work_size > 0) { - GGML_ASSERT(ctx->work_data); + if (plan->work_size > 0) { + GGML_ASSERT(plan->work_data); } for (int i = 0; i < cgraph->n_nodes; ++i) { if (cgraph->nodes[i]->op != GGML_OP_NONE) { - GGML_ASSERT(ctx->n_tasks[i] > 0); + GGML_ASSERT(plan->n_tasks[i] > 0); } } } - const 
int n_threads = ctx->n_threads; + const int n_threads = plan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, - /*.cgraph_ctx =*/ ctx, + /*.cgraph_plan =*/ plan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, diff --git a/ggml.h b/ggml.h index f92f428fa2090..fae63e6312c41 100644 --- a/ggml.h +++ b/ggml.h @@ -66,14 +66,14 @@ // ggml_set_f32(b, 4.0f); // // const int n_threads = 1; -// struct ggml_graph_compute_plan ctx = ggml_graph_compute_make_plan(&gf, n_threads); -// if (ctx.work_size > 0) { -// ctx.work_data = malloc(ctx.work_size); -// GGML_ASSERT(ctx.work_data); +// struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); +// if (plan.work_size > 0) { +// plan.work_data = malloc(plan.work_size); +// GGML_ASSERT(plan.work_data); // } -// ggml_graph_compute(&ctx, &gf); -// if (ctx.work_data) { -// free(ctx.work_data); +// ggml_graph_compute(&plan, &gf); +// if (plan.work_data) { +// free(plan.work_data); // } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); From db81f33ef2067d000576fbc24d3e32a5f5d1ba0e Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 18:10:00 +0800 Subject: [PATCH 04/20] remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward --- tests/test-grad0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 11bb2307f627e..477fedfeef63e 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -218,7 +218,6 @@ bool check_gradient( struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - ggml_graph_compute(ctx0, &gf); { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { From 2b502c32caa5f5f5a996bd8ba73ae8120767eb9b Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 20:28:07 +0800 Subject: [PATCH 05/20] add static ggml_graph_compute_sugar() --- ggml.c | 64 +++++++++++++++++----------------------------------------- 1 file changed, 19 insertions(+), 45 deletions(-) diff --git a/ggml.c b/ggml.c index 4968f36c25edb..0e906d0c3dde7 100644 --- a/ggml.c +++ b/ggml.c @@ -16424,7 +16424,6 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap GGML_ASSERT(plan->n_tasks[i] > 0); } } - } const int n_threads = plan->n_threads; @@ -16491,6 +16490,20 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } } +static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); + if (plan.work_size > 0) { + plan.work_data = malloc(plan.work_size); + GGML_ASSERT(plan.work_data); + } + + ggml_graph_compute(&plan, cgraph); + + if (plan.work_data) { + free(plan.work_data); + } +} + void ggml_graph_reset(struct ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * grad = cgraph->grads[i]; @@ -17327,17 +17340,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17418,17 +17421,7 @@ 
static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17550,17 +17543,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params->n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + ggml_graph_compute_sugar(gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17679,17 +17662,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gb, params.n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, gb); - if (plan.work_data) { - free(plan.work_data); - } - } + + ggml_graph_compute_sugar(gb, params.n_threads); ggml_opt_get_grad(np, ps, g); From cb1dec0ec04228aa13137caebe951cd7843a6816 Mon Sep 17 00:00:00 2001 From: mqy Date: Mon, 3 Jul 2023 23:58:31 +0800 Subject: [PATCH 06/20] minor: update comments --- ggml.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml.h b/ggml.h index fae63e6312c41..0f1bd138bff29 100644 --- a/ggml.h +++ b/ggml.h @@ -448,8 +448,7 @@ extern "C" { struct ggml_graph_compute_plan { // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; - // Worker buffer. - // Expect allocate/free by caller before/after calling to `ggml_graph_compute()`. + // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. 
void * work_data; int n_threads; From b1331d7e604eeae9b9b0e4f7b3a50b70b49c1b44 Mon Sep 17 00:00:00 2001 From: mqy Date: Tue, 4 Jul 2023 20:38:46 +0800 Subject: [PATCH 07/20] reusable buffers --- examples/baby-llama/baby-llama.cpp | 23 +++---- examples/benchmark/benchmark-matmult.cpp | 29 ++++----- .../train-text-from-scratch.cpp | 29 ++++----- ggml.c | 3 +- ggml.h | 2 +- llama.cpp | 64 ++++++++----------- tests/test-grad0.c | 64 +++++++++++-------- tests/test-opt.c | 46 +++++++++---- 8 files changed, 126 insertions(+), 134 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index f147c23a205b5..785e7e8860fff 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1569,6 +1569,8 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; + auto compute_plan_buffer = std::vector(); + for (int ex=0; ex 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1625,13 +1624,10 @@ int main(int argc, char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1689,13 +1685,10 @@ int main(int argc, char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e4f361e13fdec..e7d75c9ae51e0 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -164,16 +164,15 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); + auto compute_plan_buffer = std::vector(); + { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } TENSOR_DUMP(gf.nodes[0]); @@ -229,15 +228,12 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + 
compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf31); - if (plan.work_data) { - free(plan.work_data); - } } long long int stop = ggml_time_us(); @@ -272,15 +268,12 @@ int main(int argc, char ** argv) { // Running a different graph computation to make sure we override the CPU cache lines { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf32); - if (plan.work_data) { - free(plan.work_data); - } } } printf("\n"); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 83da31531da57..0345b8dc02748 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3181,6 +3181,8 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } + auto compute_plan_buffer = std::vector(); + printf("%s: begin training\n", __func__); for (int ex = 0; ex < params.n_examples; ++ex) { @@ -3244,15 +3246,12 @@ int main(int argc, char ** argv) { } { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, gf); - if (plan.work_data) { - free(plan.work_data); - } } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3278,15 +3277,12 @@ int main(int argc, char ** argv) { model.train_tokens += n_batch * n_tokens; { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, gf); - if (plan.work_data) { - free(plan.work_data); - } } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3376,15 +3372,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, params.n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/ggml.c b/ggml.c index 0e906d0c3dde7..94a71070612cc 100644 --- a/ggml.c +++ b/ggml.c @@ -15974,7 +15974,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const struct ggml_cgraph * cgraph = state->shared->cgraph; const struct ggml_graph_compute_plan * plan = state->shared->plan; - const int *n_tasks_arr = plan->n_tasks; + 
const int * n_tasks_arr = plan->n_tasks; const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); @@ -16490,6 +16490,7 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } } +// TODO: avoid allocating memory frequently. static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); if (plan.work_size > 0) { diff --git a/ggml.h b/ggml.h index 0f1bd138bff29..1b50ab8666ed6 100644 --- a/ggml.h +++ b/ggml.h @@ -449,7 +449,7 @@ extern "C" { // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. size_t work_size; // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. - void * work_data; + uint8_t * work_data; int n_threads; diff --git a/llama.cpp b/llama.cpp index d1ae57298b3eb..c29d46d8dd596 100644 --- a/llama.cpp +++ b/llama.cpp @@ -321,6 +321,10 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; + // reusable buffer for `struct ggml_graph_compute_plan.work_data` + // std::vector guarantees the elements are stored contiguously. + std::vector compute_plan_buffer; + // memory buffers used to evaluate the model // TODO: move in llama_state llama_ctx_buffer buf_compute; @@ -1591,10 +1595,13 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); + bool call_ggml_graph_compute = true; + #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); + call_ggml_graph_compute = false; } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1611,32 +1618,17 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } - ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } - } } -#else - { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); +#endif + + if (call_ggml_graph_compute) { + auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + lctx.compute_plan_buffer.resize(plan.work_size); + plan.work_data = lctx.compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } -#endif if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); @@ -2822,6 +2814,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // read tensors and apply bool warned = false; int n_tensors = 0; + + auto compute_plan_buffer = std::vector(); + while (true) { int32_t n_dims; int32_t length; @@ -2988,15 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); + auto plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + compute_plan_buffer.resize(plan.work_size); + 
plan.work_data = compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } // we won't need these tensors again, reset the context to save memory @@ -3171,15 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + ctx->compute_plan_buffer.resize(plan.work_size); + plan.work_data = ctx->compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } ggml_free(cpy_ctx); @@ -3287,15 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); + auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + ctx->compute_plan_buffer.resize(plan.work_size); + plan.work_data = ctx->compute_plan_buffer.data(); } ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } ggml_free(cpy_ctx); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 477fedfeef63e..548547727efdc 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -191,6 +191,32 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } +struct compute_plan_buffer { + size_t size; + uint8_t * data; +}; + +static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { + if (size == 0) { + return NULL; + } + + GGML_ASSERT(buf); + + if (buf->size == 0) { + buf->data = malloc(size); + buf->size = size; + } else if (buf->size < size) { + buf->data = realloc(buf->data, size); + buf->size = size; + } else { + // skip shrinking. 
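The helper above grows the scratch buffer on demand and deliberately never shrinks it, so the many graph evaluations performed by the gradient checks can share a single allocation. A hedged sketch of how the call sites below use it, assuming `gb` and `n_threads` are in scope as in `check_gradient()`:

    struct compute_plan_buffer plan_buf = { /*.size =*/ 0, /*.data =*/ NULL };

    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads);
    plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size);
    ggml_graph_compute(&plan, &gb);

    // ... further evaluations reuse plan_buf ...

    if (plan_buf.data) {
        free(plan_buf.data);
    }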
+ } + + GGML_ASSERT(buf->data); + return buf->data; +} + bool check_gradient( const char * op_name, struct ggml_context * ctx0, @@ -218,6 +244,8 @@ bool check_gradient( struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); if (plan.work_size > 0) { @@ -235,14 +263,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gb); - if (plan.work_data) { - free(plan.work_data); - } } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); @@ -259,14 +281,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } const float f0 = ggml_get_f32_1d(f, 0); @@ -275,14 +291,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); - } } const float f1 = ggml_get_f32_1d(f, 0); @@ -297,14 +307,8 @@ bool check_gradient( { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &gb); - if (plan.work_data) { - free(plan.work_data); - } } const float g1 = get_element(x[i]->grad, k); @@ -321,6 +325,10 @@ bool check_gradient( } } + if (plan_buf.data) { + free(plan_buf.data); + } + return true; } diff --git a/tests/test-opt.c b/tests/test-opt.c index cb0d58199991a..35d070dc7a095 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -114,6 +114,31 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } + +struct compute_plan_buffer { + size_t size; + uint8_t * data; +}; + +static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { + if (size == 0) { + return NULL; + } + + if (buf->size == 0) { + buf->data = malloc(size); + buf->size = size; + } else if (buf->size < size) { + buf->data = realloc(buf->data, size); + buf->size = size; + } else { + // skip shrinking. 
+ } + + GGML_ASSERT(buf->data); + return buf->data; +} + int main(int argc, const char ** argv) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, @@ -141,16 +166,11 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); + struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &ge); - if (plan.work_data) { - free(plan.work_data); - } } const float fe = ggml_get_f32_1d(e, 0); @@ -164,14 +184,12 @@ int main(int argc, const char ** argv) { { struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); - } + plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); ggml_graph_compute(&plan, &ge); - if (plan.work_data) { - free(plan.work_data); - } + } + + if (plan_buf.data) { + free(plan_buf.data); } const float fe_opt = ggml_get_f32_1d(e, 0); From 53cfb4b9957a54f20f25089da40aa9718e41aad7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:23:08 +0300 Subject: [PATCH 08/20] ggml : more consistent naming + metal fixes --- examples/baby-llama/baby-llama.cpp | 32 ++++---- examples/benchmark/benchmark-matmult.cpp | 32 ++++---- examples/metal/metal.cpp | 3 +- .../train-text-from-scratch.cpp | 34 ++++---- ggml-metal.h | 6 +- ggml-metal.m | 11 ++- ggml.c | 77 ++++++++++--------- ggml.h | 24 +++--- llama.cpp | 54 ++++++------- tests/CMakeLists.txt | 4 +- tests/test-grad0.c | 65 ++++++++-------- tests/test-opt.c | 27 +++---- 12 files changed, 194 insertions(+), 175 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 785e7e8860fff..5d66089b1e22e 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1569,7 +1569,7 @@ int main(int argc, char ** argv) { int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; for (int ex=0; ex 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1622,12 +1622,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, e); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1683,12 +1683,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - 
compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index e7d75c9ae51e0..840f4fe525cfb 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -164,15 +164,15 @@ int main(int argc, char ** argv) { TENSOR_DUMP(m11); TENSOR_DUMP(m2); - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; { - auto plan = ggml_graph_compute_make_plan(&gf, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } TENSOR_DUMP(gf.nodes[0]); @@ -228,12 +228,12 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); { - auto plan = ggml_graph_compute_make_plan(&gf31, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads); + if (pf31.work_size > 0) { + work_buffer.resize(pf31.work_size); + pf31.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf31); + ggml_graph_compute(&gf31, &pf31); } long long int stop = ggml_time_us(); @@ -268,12 +268,12 @@ int main(int argc, char ** argv) { // Running a different graph computation to make sure we override the CPU cache lines { - auto plan = ggml_graph_compute_make_plan(&gf32, benchmark_params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads); + if (pf32.work_size > 0) { + work_buffer.resize(pf32.work_size); + pf32.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf32); + ggml_graph_compute(&gf32, &pf32); } } printf("\n"); diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index cdfe4bfe97865..7438defdefcdf 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -35,10 +35,9 @@ int main(int argc, char ** argv) { struct ggml_context * ctx_eval = NULL; struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); - gf.n_threads = 1; // this allocates all Metal resources and memory buffers - auto * ctx_metal = ggml_metal_init(); + auto * ctx_metal = ggml_metal_init(1); const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 0345b8dc02748..11ffbe2e1e3a1 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3160,6 +3160,7 @@ int main(int argc, char ** argv) { printf("used_mem model+cache: %zu 
bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); + // TODO: use std::vector intead of "new" size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb); uint8_t * compute_addr = new uint8_t[compute_size]; @@ -3181,7 +3182,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; printf("%s: begin training\n", __func__); @@ -3246,12 +3247,12 @@ int main(int argc, char ** argv) { } { - auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, gf); + ggml_graph_compute(gf, &pf); } size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3277,12 +3278,12 @@ int main(int argc, char ** argv) { model.train_tokens += n_batch * n_tokens; { - auto plan = ggml_graph_compute_make_plan(gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, gf); + ggml_graph_compute(gf, &pf); } float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3372,12 +3373,12 @@ int main(int argc, char ** argv) { ggml_build_forward_expand(&gf, logits); { - auto plan = ggml_graph_compute_make_plan(&gf, params.n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); @@ -3404,6 +3405,7 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); llama_free_model(lmodel); ggml_free(model.ctx); diff --git a/ggml-metal.h b/ggml-metal.h index b9e50ac745eb0..928f1705c381c 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -34,9 +34,13 @@ extern "C" { struct ggml_metal_context; -struct ggml_metal_context * ggml_metal_init(void); +// number of command buffers to use +struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); +// set the number of command buffers to use +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb); + // creates a mapping between a host memory buffer and a device memory buffer // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels diff --git a/ggml-metal.m b/ggml-metal.m index fd69c41fe357d..3f15f791f9f65 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -25,6 +25,8 @@ }; struct ggml_metal_context { + int n_cb; + float * logits; id device; @@ -86,11 +88,12 @@ @interface GGMLMetalClass : NSObject @implementation GGMLMetalClass @end -struct ggml_metal_context * ggml_metal_init(void) { +struct ggml_metal_context * ggml_metal_init(int n_cb) 
{ fprintf(stderr, "%s: allocating\n", __func__); struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + ctx->n_cb = n_cb; ctx->device = MTLCreateSystemDefaultDevice(); ctx->queue = [ctx->device newCommandQueue]; ctx->n_buffers = 0; @@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { free(ctx); } +void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) { + ctx->n_cb = n_cb; +} + // finds the Metal buffer that contains the tensor data on the GPU device // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the // Metal buffer based on the host memory pointer @@ -354,7 +361,7 @@ void ggml_metal_graph_compute( // create multiple command buffers and enqueue them // then, we encode the graph into the command buffers in parallel - const int n_cb = gf->n_threads; + const int n_cb = ctx->n_cb; NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb]; diff --git a/ggml.c b/ggml.c index 94a71070612cc..23938fc5f27b2 100644 --- a/ggml.c +++ b/ggml.c @@ -15942,7 +15942,7 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { const struct ggml_cgraph * cgraph; - const struct ggml_graph_compute_plan * plan; + const struct ggml_cplan * cplan; int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; @@ -15971,12 +15971,13 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; - const struct ggml_graph_compute_plan * plan = state->shared->plan; - const int * n_tasks_arr = plan->n_tasks; + const int * n_tasks_arr = cplan->n_tasks; + const int n_threads = state->shared->n_threads; - const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; @@ -15989,8 +15990,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, - /*.wsize =*/ plan->work_size, - /*.wdata =*/ plan->work_data, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (node_n != -1) { @@ -16059,8 +16060,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, - /*.wsize =*/ plan->work_size, - /*.wdata =*/ plan->work_data, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, }; if (state->ith < n_tasks) { @@ -16072,14 +16073,16 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } // Prepare for graph computing. 
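This patch renames the planning API across ggml.h and the call sites: `struct ggml_graph_compute_plan` becomes `struct ggml_cplan`, `ggml_graph_compute_make_plan()` becomes `ggml_graph_plan()`, and `ggml_graph_compute()` now takes the graph first and the plan second. A before/after sketch of a typical call site (work-buffer handling elided; `gf` and `n_threads` assumed in scope):

    // before this patch
    struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads);
    ggml_graph_compute(&plan, &gf);

    // after this patch
    struct ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);
    ggml_graph_compute(&gf, &cplan);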
-struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } - struct ggml_graph_compute_plan plan; - memset(&plan, 0, sizeof(struct ggml_graph_compute_plan)); - int * n_tasks = plan.n_tasks; + struct ggml_cplan cplan; + memset(&cplan, 0, sizeof(struct ggml_cplan)); + + int * n_tasks = cplan.n_tasks; + size_t work_size = 0; // initialize tasks + work buffer @@ -16403,34 +16406,34 @@ struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * work_size += CACHE_LINE_SIZE*(n_threads - 1); } - plan.n_threads = n_threads; - plan.work_size = work_size; - plan.work_data = NULL; + cplan.n_threads = n_threads; + cplan.work_size = work_size; + cplan.work_data = NULL; - return plan; + return cplan; } -void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph) { +void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { { - GGML_ASSERT(plan); - GGML_ASSERT(plan->n_threads > 0); + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); - if (plan->work_size > 0) { - GGML_ASSERT(plan->work_data); + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); } for (int i = 0; i < cgraph->n_nodes; ++i) { if (cgraph->nodes[i]->op != GGML_OP_NONE) { - GGML_ASSERT(plan->n_tasks[i] > 0); + GGML_ASSERT(cplan->n_tasks[i] > 0); } } } - const int n_threads = plan->n_threads; + const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, - /*.cgraph_plan =*/ plan, + /*.cgraph_plan =*/ cplan, /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, @@ -16491,17 +16494,19 @@ void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgrap } // TODO: avoid allocating memory frequently. 
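The TODO above refers to this helper calling malloc()/free() on every invocation. The long-lived C++ call sites in this series sidestep that by keeping one reusable buffer and only growing it; a minimal sketch of the pattern, loosely mirroring llama.cpp's `work_buffer` (with `gf` and `n_threads` assumed in scope):

    std::vector<uint8_t> work_buffer; // lives across evaluations

    ggml_cplan cplan = ggml_graph_plan(&gf, n_threads);
    if (cplan.work_size > 0) {
        work_buffer.resize(cplan.work_size);
        cplan.work_data = work_buffer.data();
    }
    ggml_graph_compute(&gf, &cplan);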
-static void ggml_graph_compute_sugar(struct ggml_cgraph * cgraph, int n_threads) { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(cgraph, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); +// TODO: make part of public API - use different name and put warning that it makes allocations +static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) { + struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); + + if (cplan.work_size > 0) { + cplan.work_data = malloc(cplan.work_size); + GGML_ASSERT(cplan.work_data); } - ggml_graph_compute(&plan, cgraph); + ggml_graph_compute(cgraph, &cplan); - if (plan.work_data) { - free(plan.work_data); + if (cplan.work_data) { + free(cplan.work_data); } } @@ -17341,7 +17346,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17422,7 +17427,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17544,7 +17549,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params->n_threads); + ggml_graph_compute_helper(gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17664,7 +17669,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_sugar(gb, params.n_threads); + ggml_graph_compute_helper(gb, params.n_threads); ggml_opt_get_grad(np, ps, g); diff --git a/ggml.h b/ggml.h index 1b50ab8666ed6..901c701ea866f 100644 --- a/ggml.h +++ b/ggml.h @@ -443,17 +443,15 @@ extern "C" { static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); - // The default graph compute plan that needs to be prepared for ggml_graph_compute(). - // Since https://github.com/ggerganov/ggml/issues/287 - struct ggml_graph_compute_plan { - // Size of work buffer, calculated by `ggml_graph_compute_make_plan()`. - size_t work_size; - // Work buffer, to be allocated by caller before calling to `ggml_graph_compute()`. - uint8_t * work_data; + // the compute plan that needs to be prepared for ggml_graph_compute() + // since https://github.com/ggerganov/ggml/issues/287 + struct ggml_cplan { + size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()` + uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` int n_threads; - // The `n_tasks` of nodes, 1:1 mapping to cgraph nodes. + // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes int n_tasks[GGML_MAX_NODES]; }; @@ -1313,11 +1311,11 @@ extern "C" { GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); - // ggml_graph_compute_make_plan() needs to be called before ggml_graph_compute(). - // Returns a plan object. When plan.work_size > 0, caller must allocate memory for plan.work_data. 
- GGML_API struct ggml_graph_compute_plan ggml_graph_compute_make_plan(struct ggml_cgraph * cgraph, const int n_threads/*=GGML_DEFAULT_N_THREADS*/); - GGML_API void ggml_graph_compute(struct ggml_graph_compute_plan * plan, struct ggml_cgraph * cgraph); - GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // ggml_graph_plan() has to be called before ggml_graph_compute() + // when plan.work_size > 0, caller must allocate memory for plan.work_data + GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); diff --git a/llama.cpp b/llama.cpp index c29d46d8dd596..e68beb7c5b8b4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -321,9 +321,8 @@ struct llama_context { // input embedding (1-dimensional array: [n_embd]) std::vector embedding; - // reusable buffer for `struct ggml_graph_compute_plan.work_data` - // std::vector guarantees the elements are stored contiguously. - std::vector compute_plan_buffer; + // reusable buffer for `struct ggml_graph_plan.work_data` + std::vector work_buffer; // memory buffers used to evaluate the model // TODO: move in llama_state @@ -1599,6 +1598,7 @@ static bool llama_eval_internal( #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { + ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); call_ggml_graph_compute = false; @@ -1622,12 +1622,12 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - auto plan = ggml_graph_compute_make_plan(&gf, actual_n_threads); - if (plan.work_size > 0) { - lctx.compute_plan_buffer.resize(plan.work_size); - plan.work_data = lctx.compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads); + if (pf.work_size > 0) { + lctx.work_buffer.resize(pf.work_size); + pf.work_data = lctx.work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } if (cgraph_fname) { @@ -2587,8 +2587,8 @@ void llama_free_model(struct llama_model * model) { } struct llama_context * llama_new_context_with_model( - struct llama_model * model, - struct llama_context_params params) { + struct llama_model * model, + struct llama_context_params params) { if (!model) { return nullptr; @@ -2657,7 +2657,7 @@ struct llama_context * llama_new_context_with_model( #ifdef GGML_USE_METAL if (params.n_gpu_layers > 0) { // this allocates all Metal resources and memory buffers - ctx->ctx_metal = ggml_metal_init(); + ctx->ctx_metal = ggml_metal_init(1); void * data_ptr = NULL; size_t data_size = 0; @@ -2815,7 +2815,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const bool warned = false; int n_tensors = 0; - auto compute_plan_buffer = std::vector(); + std::vector work_buffer; while (true) { int32_t n_dims; @@ -2983,12 +2983,12 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); { - auto plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - compute_plan_buffer.resize(plan.work_size); - plan.work_data = compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + if (pf.work_size > 0) { + work_buffer.resize(pf.work_size); + pf.work_data = 
work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } // we won't need these tensors again, reset the context to save memory @@ -3163,12 +3163,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); { - auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - ctx->compute_plan_buffer.resize(plan.work_size); - plan.work_data = ctx->compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + ctx->work_buffer.resize(pf.work_size); + pf.work_data = ctx->work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } ggml_free(cpy_ctx); @@ -3276,12 +3276,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); { - auto plan = ggml_graph_compute_make_plan(&gf, /*n_threads*/ 1); - if (plan.work_size > 0) { - ctx->compute_plan_buffer.resize(plan.work_size); - plan.work_data = ctx->compute_plan_buffer.data(); + ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); + if (pf.work_size > 0) { + ctx->work_buffer.resize(pf.work_size); + pf.work_data = ctx->work_buffer.data(); } - ggml_graph_compute(&plan, &gf); + ggml_graph_compute(&gf, &pf); } ggml_free(cpy_ctx); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..dd989c5c041f7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW -# llama_add_test(test-opt.c) # SLOW +llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-opt.c) # SLOW diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 548547727efdc..9c27e603e6bee 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -10,6 +10,8 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +#pragma GCC diagnostic ignored "-Wdouble-promotion" + #define MAX_NARGS 3 #undef MIN @@ -49,7 +51,7 @@ float frand(void) { int irand(int n) { if (n == 0) return 0; - else return rand()%n; + return rand()%n; } void get_random_dims(int64_t * dims, int ndims) { @@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int( float get_element(const struct ggml_tensor * t, int idx) { if (t->type == GGML_TYPE_F32) { return ((float *)t->data)[idx]; - } else if (t->type == GGML_TYPE_I32) { + } + + if (t->type == GGML_TYPE_I32) { return ((int32_t *)t->data)[idx]; - } else { - assert(false); - return INFINITY; } + + assert(false); + return INFINITY; } void set_element(struct ggml_tensor * t, int idx, float value) { @@ -191,12 +195,12 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } -struct compute_plan_buffer { +struct work_buffer { size_t size; uint8_t * data; }; -static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { +static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { if (size == 0) { return NULL; } @@ -241,20 +245,19 @@ bool check_gradient( } struct ggml_cgraph gf = ggml_build_forward (f); - struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; { - struct 
ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - if (plan.work_size > 0) { - plan.work_data = malloc(plan.work_size); - GGML_ASSERT(plan.work_data); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + if (pf.work_size > 0) { + pf.work_data = malloc(pf.work_size); + GGML_ASSERT(pf.work_data); } - ggml_graph_compute(&plan, &gf); - if (plan.work_data) { - free(plan.work_data); + ggml_graph_compute(&gf, &pf); + if (pf.work_data) { + free(pf.work_data); } } @@ -262,9 +265,9 @@ bool check_gradient( ggml_set_f32 (f->grad, 1.0f); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gb); + struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); @@ -280,9 +283,9 @@ bool check_gradient( set_element(x[i], k, xp); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gf); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float f0 = ggml_get_f32_1d(f, 0); @@ -290,9 +293,9 @@ bool check_gradient( set_element(x[i], k, xm); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gf); + struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float f1 = ggml_get_f32_1d(f, 0); @@ -306,15 +309,15 @@ bool check_gradient( ggml_set_f32 (f->grad, 1.0f); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gb, n_threads); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &gb); + struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); + pf.work_data = work_buffer_resize(&buf, pf.work_size); + ggml_graph_compute(&gf, &pf); } const float g1 = get_element(x[i]->grad, k); const float error_abs = fabsf(g0 - g1); - const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0; + const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0; if (error_abs > max_error_abs || error_rel > max_error_rel) { printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n", @@ -325,8 +328,8 @@ bool check_gradient( } } - if (plan_buf.data) { - free(plan_buf.data); + if (buf.data) { + free(buf.data); } return true; diff --git a/tests/test-opt.c b/tests/test-opt.c index 35d070dc7a095..3ed246b3b3b65 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -7,6 +7,7 @@ #define MAX_NARGS 2 +#pragma GCC diagnostic ignored "-Wdouble-promotion" // // logging @@ -33,7 +34,7 @@ #define GGML_PRINT(...) 
printf(__VA_ARGS__) -float frand() { +float frand(void) { return (float)rand()/(float)RAND_MAX; } @@ -115,12 +116,12 @@ void set_element(struct ggml_tensor * t, int idx, float value) { } -struct compute_plan_buffer { +struct work_buffer { size_t size; uint8_t * data; }; -static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t size) { +static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { if (size == 0) { return NULL; } @@ -139,7 +140,7 @@ static uint8_t * ensure_plan_work_data(struct compute_plan_buffer *buf, size_t s return buf->data; } -int main(int argc, const char ** argv) { +int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, .mem_buffer = NULL, @@ -166,11 +167,11 @@ int main(int argc, const char ** argv) { struct ggml_cgraph ge = ggml_build_forward(e); ggml_graph_reset (&ge); - struct compute_plan_buffer plan_buf = { /*.size = */ 0, /*.data =*/ NULL }; + struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &ge); + struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); + pe.work_data = work_buffer_resize(&buf, pe.work_size); + ggml_graph_compute(&ge, &pe); } const float fe = ggml_get_f32_1d(e, 0); @@ -183,13 +184,13 @@ int main(int argc, const char ** argv) { ggml_graph_reset (&ge); { - struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&ge, /*n_threads*/ 1); - plan.work_data = ensure_plan_work_data(&plan_buf, plan.work_size); - ggml_graph_compute(&plan, &ge); + struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); + pe.work_data = work_buffer_resize(&buf, pe.work_size); + ggml_graph_compute(&ge, &pe); } - if (plan_buf.data) { - free(plan_buf.data); + if (buf.data) { + free(buf.data); } const float fe_opt = ggml_get_f32_1d(e, 0); From 4646cc2cf16bdece6ba87b6444fe8b02e87f1c5b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:25:27 +0300 Subject: [PATCH 09/20] ggml : fix docs --- ggml.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/ggml.h b/ggml.h index 901c701ea866f..78870147c6857 100644 --- a/ggml.h +++ b/ggml.h @@ -65,15 +65,17 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// const int n_threads = 1; -// struct ggml_graph_compute_plan plan = ggml_graph_compute_make_plan(&gf, n_threads); -// if (plan.work_size > 0) { -// plan.work_data = malloc(plan.work_size); -// GGML_ASSERT(plan.work_data); +// struct ggml_cplan pf = ggml_graph_compute_make_plan(&gf, n_threads); +// +// if (pf.work_size > 0) { +// pf.work_data = malloc(pf.work_size); +// GGML_ASSERT(pf.work_data); // } -// ggml_graph_compute(&plan, &gf); -// if (plan.work_data) { -// free(plan.work_data); +// +// ggml_graph_compute(&gf, &pf); +// +// if (pf.work_data) { +// free(pf.work_data); // } // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); From 8e1f0b6865f11a52932278775b040a8c925e247e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:30:40 +0300 Subject: [PATCH 10/20] tests : disable grad / opt + minor naming changes --- llama.cpp | 9 +++++---- tests/CMakeLists.txt | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index e68beb7c5b8b4..5c9aea9de24fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1268,7 +1268,7 @@ static bool llama_eval_internal( const float * embd, const int n_tokens, const 
int n_past, - const int n_threads, + int n_threads, const char * cgraph_fname) { LLAMA_ASSERT((!tokens && embd) || (tokens && !embd)); @@ -1309,10 +1309,11 @@ static bool llama_eval_internal( struct ggml_context * ctx0 = ggml_init(params); + ggml_cgraph gf = {}; + // for big prompts, if BLAS is enabled, it is better to use only one thread // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance - ggml_cgraph gf = {}; - const int actual_n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; + n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -1622,7 +1623,7 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - ggml_cplan pf = ggml_graph_plan(&gf, actual_n_threads); + ggml_cplan pf = ggml_graph_plan(&gf, n_threads); if (pf.work_size > 0) { lctx.work_buffer.resize(pf.work_size); pf.work_data = lctx.work_buffer.data(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dd989c5c041f7..4171c126c7b7d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -llama_add_test(test-grad0.c) # SLOW -llama_add_test(test-opt.c) # SLOW +# llama_add_test(test-grad0.c) # SLOW +# llama_add_test(test-opt.c) # SLOW From 2392f7a9cd732032cf6662e7ce3bdef6115826b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:43:43 +0300 Subject: [PATCH 11/20] ggml : add ggml_graph_compute_with_ctx() - backwards compatible API - deduplicates a lot of copy-paste --- ggml.c | 32 ++++++++++----------- ggml.h | 6 +++- tests/test-grad0.c | 69 ++++------------------------------------------ tests/test-opt.c | 47 +++---------------------------- 4 files changed, 29 insertions(+), 125 deletions(-) diff --git a/ggml.c b/ggml.c index 23938fc5f27b2..f8eddd81695e8 100644 --- a/ggml.c +++ b/ggml.c @@ -16493,21 +16493,17 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) } } -// TODO: avoid allocating memory frequently. 
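The static helper being removed here is replaced just below by `ggml_graph_compute_with_ctx()`, which keeps the one-call convenience but carves the work buffer out of the supplied `ggml_context` (as a 1-D `GGML_TYPE_I8` tensor) instead of the heap, so the context must be created with enough spare memory to hold it. A hedged usage sketch, loosely following `tests/test-opt.c` later in this series:

    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024*1024, // must also leave room for the plan's work data
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ... build tensors and the graph `ge` in ctx ...

    ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);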
-// TODO: make part of public API - use different name and put warning that it makes allocations -static void ggml_graph_compute_helper(struct ggml_cgraph * cgraph, int n_threads) { +// same as ggml_graph_compute() but the work data is allocated as a part of the context +// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data +void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - if (cplan.work_size > 0) { - cplan.work_data = malloc(cplan.work_size); - GGML_ASSERT(cplan.work_data); - } + struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size); + GGML_ASSERT(buf); - ggml_graph_compute(cgraph, &cplan); + cplan.work_data = buf->data; - if (cplan.work_data) { - free(cplan.work_data); - } + ggml_graph_compute(cgraph, &cplan); } void ggml_graph_reset(struct ggml_cgraph * cgraph) { @@ -17292,6 +17288,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g // static enum ggml_opt_result ggml_opt_adam( + struct ggml_context * ctx, struct ggml_opt_context * opt, struct ggml_opt_params params, struct ggml_tensor * f, @@ -17346,7 +17343,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); opt->adam.fx_prev = ggml_get_f32_1d(f, 0); opt->adam.fx_best = opt->adam.fx_prev; @@ -17427,7 +17424,7 @@ static enum ggml_opt_result ggml_opt_adam( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); const float fx = ggml_get_f32_1d(f, 0); @@ -17498,6 +17495,7 @@ struct ggml_lbfgs_iteration_data { }; static enum ggml_opt_result linesearch_backtracking( + struct ggml_context * ctx, const struct ggml_opt_params * params, int nx, float * x, @@ -17549,7 +17547,7 @@ static enum ggml_opt_result linesearch_backtracking( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params->n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params->n_threads); ggml_opt_get_grad(np, ps, g); @@ -17669,7 +17667,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_graph_reset (gf); ggml_set_f32 (f->grad, 1.0f); - ggml_graph_compute_helper(gb, params.n_threads); + ggml_graph_compute_with_ctx(ctx, gb, params.n_threads); ggml_opt_get_grad(np, ps, g); @@ -17728,7 +17726,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); - ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); + ls = linesearch_backtracking(ctx, ¶ms, nx, x, &fx, g, d, step, xp, f, gf, gb, np, ps); if (ls < 0) { // linesearch failed - go back to the previous point and return @@ -18030,7 +18028,7 @@ enum ggml_opt_result ggml_opt_resume_g( switch (opt->params.type) { case GGML_OPT_ADAM: { - result = ggml_opt_adam(opt, opt->params, f, gf, gb); + result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb); } break; case GGML_OPT_LBFGS: { diff --git a/ggml.h b/ggml.h index 78870147c6857..906045c9e1de1 100644 --- a/ggml.h +++ b/ggml.h @@ -1306,7 +1306,7 @@ extern "C" { GGML_API void ggml_set_param( struct ggml_context * ctx, - struct ggml_tensor * tensor); + struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * 
tensor); @@ -1319,6 +1319,10 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); + // same as ggml_graph_compute() but the work data is allocated as a part of the context + // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data + GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); diff --git a/tests/test-grad0.c b/tests/test-grad0.c index 9c27e603e6bee..da4001ce5269f 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -195,32 +195,6 @@ void print_elements(const char* label, const struct ggml_tensor * t) { } -struct work_buffer { - size_t size; - uint8_t * data; -}; - -static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { - if (size == 0) { - return NULL; - } - - GGML_ASSERT(buf); - - if (buf->size == 0) { - buf->data = malloc(size); - buf->size = size; - } else if (buf->size < size) { - buf->data = realloc(buf->data, size); - buf->size = size; - } else { - // skip shrinking. - } - - GGML_ASSERT(buf->data); - return buf->data; -} - bool check_gradient( const char * op_name, struct ggml_context * ctx0, @@ -247,28 +221,12 @@ bool check_gradient( struct ggml_cgraph gf = ggml_build_forward (f); struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); - struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - pf.work_data = malloc(pf.work_size); - GGML_ASSERT(pf.work_data); - } - ggml_graph_compute(&gf, &pf); - if (pf.work_data) { - free(pf.work_data); - } - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot"); // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot"); @@ -282,24 +240,15 @@ bool check_gradient( const float xp = x0 + eps; set_element(x[i], k, xp); - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f0 = ggml_get_f32_1d(f, 0); set_element(x[i], k, xm); - { - struct ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gf, n_threads); const float f1 = ggml_get_f32_1d(f, 0); - const float g0 = (f0 - f1)/(2.0f*eps); set_element(x[i], k, x0); @@ -308,11 +257,7 @@ bool check_gradient( ggml_graph_reset (&gf); ggml_set_f32 (f->grad, 1.0f); - { - struct ggml_cplan pf = ggml_graph_plan(&gb, n_threads); - pf.work_data = work_buffer_resize(&buf, pf.work_size); - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_with_ctx(ctx0, &gb, n_threads); const float g1 = get_element(x[i]->grad, k); @@ -328,10 +273,6 @@ bool check_gradient( } } - if (buf.data) { - free(buf.data); - } - return true; } diff --git a/tests/test-opt.c 
b/tests/test-opt.c index 3ed246b3b3b65..e928a7df7ee68 100644 --- a/tests/test-opt.c +++ b/tests/test-opt.c @@ -115,31 +115,6 @@ void set_element(struct ggml_tensor * t, int idx, float value) { ((float *)t->data)[idx] = value; } - -struct work_buffer { - size_t size; - uint8_t * data; -}; - -static uint8_t * work_buffer_resize(struct work_buffer * buf, size_t size) { - if (size == 0) { - return NULL; - } - - if (buf->size == 0) { - buf->data = malloc(size); - buf->size = size; - } else if (buf->size < size) { - buf->data = realloc(buf->data, size); - buf->size = size; - } else { - // skip shrinking. - } - - GGML_ASSERT(buf->data); - return buf->data; -} - int main(void) { struct ggml_init_params params = { .mem_size = 1024*1024*1024, @@ -163,16 +138,10 @@ int main(void) { struct ggml_tensor * d = ggml_sub(ctx, c, ab); struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d)); - struct ggml_cgraph ge = ggml_build_forward(e); - ggml_graph_reset (&ge); + ggml_graph_reset(&ge); - struct work_buffer buf = { /*.size = */ 0, /*.data =*/ NULL }; - { - struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); - pe.work_data = work_buffer_resize(&buf, pe.work_size); - ggml_graph_compute(&ge, &pe); - } + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); @@ -181,17 +150,9 @@ int main(void) { ggml_opt(ctx, opt_params, e); - ggml_graph_reset (&ge); + ggml_graph_reset(&ge); - { - struct ggml_cplan pe = ggml_graph_plan(&ge, /*n_threads*/ 1); - pe.work_data = work_buffer_resize(&buf, pe.work_size); - ggml_graph_compute(&ge, &pe); - } - - if (buf.data) { - free(buf.data); - } + ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1); const float fe_opt = ggml_get_f32_1d(e, 0); printf("%s: original e = %.4f\n", __func__, fe); From 1b9994f8098b3bb49e82672ccec40a704769d07f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 20:57:12 +0300 Subject: [PATCH 12/20] ci : enable test-grad0 --- .github/workflows/build.yml | 25 +++++++++++++++++++++---- tests/CMakeLists.txt | 2 +- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 12481e8be7cf7..547b03a7a7772 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -41,6 +41,10 @@ jobs: ubuntu-latest-cmake: runs-on: ubuntu-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + steps: - name: Clone id: checkout @@ -64,11 +68,15 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + continue-on-error: true strategy: @@ -99,7 +107,7 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 macOS-latest-make: runs-on: macos-latest @@ -123,6 +131,10 @@ jobs: macOS-latest-cmake: runs-on: macos-latest + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + steps: - name: Clone id: checkout @@ -147,10 +159,15 @@ jobs: id: cmake_test run: | cd build - ctest --verbose + ctest --verbose --timeout 900 windows-latest-cmake: runs-on: windows-latest + + env: + GGML_NLOOP: 3 + GGML_NITER: 1 + env: OPENBLAS_VERSION: 0.3.23 OPENCL_VERSION: 2023.04.17 @@ -249,7 +266,7 @@ jobs: if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --verbose + ctest -C Release --verbose --timeout 900 - name: Get commit hash id: commit diff --git 
a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 4171c126c7b7d..1acf050a743e4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp) llama_add_test(test-quantize-perf.cpp) llama_add_test(test-sampling.cpp) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) -# llama_add_test(test-grad0.c) # SLOW +llama_add_test(test-grad0.c) # SLOW # llama_add_test(test-opt.c) # SLOW From a67404e7497445b8f63c750ec6f285304b7b13ee Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:08:25 +0300 Subject: [PATCH 13/20] examples : factor out plan allocation into a helper function --- examples/baby-llama/baby-llama.cpp | 44 +++++++------------ .../train-text-from-scratch.cpp | 39 ++++++---------- ggml.h | 13 +----- 3 files changed, 31 insertions(+), 65 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 5d66089b1e22e..4965881ecec22 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) { return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor( struct ggml_tensor * tensor, int ndims, @@ -1596,15 +1607,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * e = square_error_loss(ctx0, targets, logits); ggml_build_forward_expand(&gf, e); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); float error_before_opt = ggml_get_f32_1d(e, 0); @@ -1620,15 +1623,7 @@ int main(int argc, char ** argv) { ggml_opt(ctx0, opt_params_lbfgs, e); // ggml_build_forward_expand(&gf, e); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); float error_after_opt = ggml_get_f32_1d(e, 0); @@ -1681,15 +1676,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - - { - struct ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1); struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); @@ -1711,10 +1698,11 @@ int main(int argc, char ** argv) { } print_matrix(model.tok_embeddings); - printf("done\n"); + // ggml_free(kv_self.ctx); // ggml_free(model_lora.ctx); ggml_free(model.ctx); + return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 
11ffbe2e1e3a1..b96fdcdc44b57 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -3246,14 +3257,7 @@ int main(int argc, char ** argv) { *gb = ggml_build_backward(ctx0, gf, true); } - { - ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(gf, &pf); - } + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); size_t used_mem_before_opt = ggml_used_mem(ctx0); @@ -3277,14 +3281,7 @@ int main(int argc, char ** argv) { model.train_samples += n_batch; model.train_tokens += n_batch * n_tokens; - { - ggml_cplan pf = ggml_graph_plan(gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(gf, &pf); - } + ggml_graph_compute_helper(work_buffer, gf, params.n_threads); float error_after_opt = ggml_get_f32_1d(loss, 0); @@ -3371,15 +3368,7 @@ int main(int argc, char ** argv) { struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past); ggml_build_forward_expand(&gf, logits); - - { - ggml_cplan pf = ggml_graph_plan(&gf, params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, params.n_threads); //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx); //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx); diff --git a/ggml.h b/ggml.h index 906045c9e1de1..d6cde970de53b 100644 --- a/ggml.h +++ b/ggml.h @@ -65,18 +65,7 @@ // ggml_set_f32(a, 3.0f); // ggml_set_f32(b, 4.0f); // -// struct ggml_cplan pf = ggml_graph_compute_make_plan(&gf, n_threads); -// -// if (pf.work_size > 0) { -// pf.work_data = malloc(pf.work_size); -// GGML_ASSERT(pf.work_data); -// } -// -// ggml_graph_compute(&gf, &pf); -// -// if (pf.work_data) { -// free(pf.work_data); -// } +// ggml_graph_compute_with_ctx(ctx, &gf, n_threads); // // printf("f = %f\n", ggml_get_f32_1d(f, 0)); // From 2d3a5252f9f08616430a2c473ca7ffa784cca46b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:12:25 +0300 Subject: [PATCH 14/20] llama : factor out plan stuff into a helper function --- llama.cpp | 56 +++++++++++++++++++++++-------------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5c9aea9de24fc..0aecbeedce92a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default (void) tensor; } +// +// ggml helpers +// + +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if 
(plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + +// +// memory sizes +// + static const std::map & MEM_REQ_SCRATCH0() { static std::map k_sizes = { @@ -761,7 +780,6 @@ struct llama_model_loader { }; - // // kv cache // @@ -1623,12 +1641,7 @@ static bool llama_eval_internal( #endif if (call_ggml_graph_compute) { - ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - lctx.work_buffer.resize(pf.work_size); - pf.work_data = lctx.work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } if (cgraph_fname) { @@ -2983,14 +2996,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const struct ggml_cgraph gf = ggml_build_forward(r); - { - ggml_cplan pf = ggml_graph_plan(&gf, n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, n_threads); // we won't need these tensors again, reset the context to save memory ggml_free(lora_ctx); @@ -3162,15 +3168,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d)); - - { - ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - ctx->work_buffer.resize(pf.work_size); - pf.work_data = ctx->work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } @@ -3275,15 +3273,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d)); ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d)); - - { - ggml_cplan pf = ggml_graph_plan(&gf, /*n_threads*/ 1); - if (pf.work_size > 0) { - ctx->work_buffer.resize(pf.work_size); - pf.work_data = ctx->work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1); ggml_free(cpy_ctx); } From 8fdf86dd253aef32a7e4acd97bde2150e0f3c40a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:15:17 +0300 Subject: [PATCH 15/20] ci : fix env --- .github/workflows/build.yml | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 547b03a7a7772..a576139efd0ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -16,7 +16,9 @@ on: paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] env: - BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + BRANCH_NAME: ${{ github.head_ref || github.ref_name }} + GGML_NLOOP: 3 + GGML_NITER: 1 jobs: ubuntu-focal-make: @@ -41,10 +43,6 @@ jobs: ubuntu-latest-cmake: runs-on: ubuntu-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - steps: - name: Clone id: checkout @@ -73,10 +71,6 @@ jobs: ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - continue-on-error: true strategy: @@ -131,10 +125,6 @@ jobs: macOS-latest-cmake: runs-on: macos-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - steps: - name: Clone id: checkout @@ -164,10 +154,6 @@ jobs: windows-latest-cmake: runs-on: windows-latest - env: - GGML_NLOOP: 3 - GGML_NITER: 1 - env: OPENBLAS_VERSION: 
0.3.23 OPENCL_VERSION: 2023.04.17 From 9c9bdaf0b8e9e3d04c0caa83a7722a14b629e475 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:18:42 +0300 Subject: [PATCH 16/20] llama : fix duplicate symbols + refactor example benchmark --- examples/benchmark/benchmark-matmult.cpp | 38 +++++++++--------------- llama.cpp | 2 +- 2 files changed, 15 insertions(+), 25 deletions(-) diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 840f4fe525cfb..f7215f43bb31c 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -20,6 +20,17 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif +void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + ggml_graph_compute(graph, &plan); +} + float tensor_sum_elements(const ggml_tensor * tensor) { float sum = 0; if (tensor->type==GGML_TYPE_F32) { @@ -166,14 +177,7 @@ int main(int argc, char ** argv) { std::vector work_buffer; - { - ggml_cplan pf = ggml_graph_plan(&gf, benchmark_params.n_threads); - if (pf.work_size > 0) { - work_buffer.resize(pf.work_size); - pf.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf, &pf); - } + ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads); TENSOR_DUMP(gf.nodes[0]); @@ -227,14 +231,7 @@ int main(int argc, char ** argv) { long long int start = ggml_time_us(); //printf("Running ggml_graph_compute\n"); - { - ggml_cplan pf31 = ggml_graph_plan(&gf31, benchmark_params.n_threads); - if (pf31.work_size > 0) { - work_buffer.resize(pf31.work_size); - pf31.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf31, &pf31); - } + ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads); long long int stop = ggml_time_us(); long long int usec = stop-start; @@ -267,14 +264,7 @@ int main(int argc, char ** argv) { } // Running a different graph computation to make sure we override the CPU cache lines - { - ggml_cplan pf32 = ggml_graph_plan(&gf32, benchmark_params.n_threads); - if (pf32.work_size > 0) { - work_buffer.resize(pf32.work_size); - pf32.work_data = work_buffer.data(); - } - ggml_graph_compute(&gf32, &pf32); - } + ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads); } printf("\n"); printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations)); diff --git a/llama.cpp b/llama.cpp index 0aecbeedce92a..5221ab5a2dd27 100644 --- a/llama.cpp +++ b/llama.cpp @@ -83,7 +83,7 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default // ggml helpers // -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { +static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); if (plan.work_size > 0) { From 8dc7f104f82c81f51175050edc91c642d33b8927 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:28:10 +0300 Subject: [PATCH 17/20] ggml : remove obsolete assert + refactor n_tasks section --- ggml.c | 549 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 273 insertions(+), 276 deletions(-) diff --git a/ggml.c b/ggml.c index f8eddd81695e8..27232af28e0e0 100644 --- a/ggml.c +++ b/ggml.c @@ -10717,8 +10717,6 @@ static void ggml_compute_forward_mul_mat( float * dst_col = 
(float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3)); - assert(ne00 % 32 == 0); - for (int64_t ic = 0; ic < ne11; ++ic) { vec_dot(ne00, &dst_col[ic*ne0], src0_row, (void *) (src1_col + ic*row_size)); } @@ -16078,328 +16076,327 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { n_threads = GGML_DEFAULT_N_THREADS; } + size_t work_size = 0; + struct ggml_cplan cplan; memset(&cplan, 0, sizeof(struct ggml_cplan)); - int * n_tasks = cplan.n_tasks; + // thread scheduling for the different operations + work buffer size estimation + for (int i = 0; i < cgraph->n_nodes; i++) { + int n_tasks = 1; - size_t work_size = 0; + struct ggml_tensor * node = cgraph->nodes[i]; - // initialize tasks + work buffer - { - // thread scheduling for the different operations - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * node = cgraph->nodes[i]; - - switch (node->op) { - case GGML_OP_CPY: - case GGML_OP_DUP: - { - n_tasks[i] = n_threads; - - size_t cur = 0; - if (ggml_is_quantized(node->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks[i]; - } + switch (node->op) { + case GGML_OP_CPY: + case GGML_OP_DUP: + { + n_tasks = n_threads; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ADD: - case GGML_OP_ADD1: - { - n_tasks[i] = n_threads; + size_t cur = 0; + if (ggml_is_quantized(node->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ADD: + case GGML_OP_ADD1: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks[i]; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_ACC: - { - n_tasks[i] = n_threads; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src0->ne[0] * n_tasks; + } - size_t cur = 0; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_ACC: + { + n_tasks = n_threads; - if (ggml_is_quantized(node->src0->type)) { - cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks[i]; - } + size_t cur = 0; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SUB: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: - case GGML_OP_ARGMAX: - case GGML_OP_REPEAT: - case GGML_OP_REPEAT_BACK: - case GGML_OP_ABS: - case GGML_OP_SGN: - case GGML_OP_NEG: - case GGML_OP_STEP: - case GGML_OP_TANH: - case GGML_OP_ELU: - case GGML_OP_RELU: - { - n_tasks[i] = 1; - } break; - case GGML_OP_MUL: - case GGML_OP_GELU: - case GGML_OP_GELU_QUICK: - case GGML_OP_SILU: - case GGML_OP_SILU_BACK: - case GGML_OP_NORM: - case GGML_OP_RMS_NORM: - case GGML_OP_RMS_NORM_BACK: - { - n_tasks[i] = n_threads; - } break; - case GGML_OP_MUL_MAT: - case GGML_OP_OUT_PROD: - { - n_tasks[i] = n_threads; - - // TODO: use different scheduling for different matrix sizes - //const int nr0 = ggml_nrows(node->src0); - //const int nr1 = ggml_nrows(node->src1); - - //n_tasks[i] = MIN(n_threads, MAX(1, nr0/128)); - //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\n", nr0, nr1, nr0*nr1, n_tasks[i]); - - size_t cur = 0; - const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; + if (ggml_is_quantized(node->src0->type)) { + cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->src1->ne[0] * n_tasks; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SUB: + 
case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + case GGML_OP_ARGMAX: + case GGML_OP_REPEAT: + case GGML_OP_REPEAT_BACK: + case GGML_OP_ABS: + case GGML_OP_SGN: + case GGML_OP_NEG: + case GGML_OP_STEP: + case GGML_OP_TANH: + case GGML_OP_ELU: + case GGML_OP_RELU: + { + n_tasks = 1; + } break; + case GGML_OP_MUL: + case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: + case GGML_OP_SILU: + case GGML_OP_SILU_BACK: + case GGML_OP_NORM: + case GGML_OP_RMS_NORM: + case GGML_OP_RMS_NORM_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_MUL_MAT: + case GGML_OP_OUT_PROD: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = ggml_nrows(node->src0); + //const int nr1 = ggml_nrows(node->src1); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + + size_t cur = 0; + const enum ggml_type vec_dot_type = type_traits[node->src0->type].vec_dot_type; #if defined(GGML_USE_CUBLAS) - if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning - } - else + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + } + else #elif defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - n_tasks[i] = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else + if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; - } + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_SCALE: + { + n_tasks = 1; + } break; + case GGML_OP_SET: + case GGML_OP_CONT: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS_BACK: + case GGML_OP_DIAG: + case GGML_OP_DIAG_MASK_ZERO: + { + n_tasks = 1; + } break; + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_MAX_BACK: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + { + n_tasks = n_threads; + } break; + case GGML_OP_ALIBI: + { + n_tasks = 1; //TODO + } break; + case GGML_OP_CLAMP: + { + 
n_tasks = 1; //TODO + } break; + case GGML_OP_CONV_1D: + { + n_tasks = n_threads; + + GGML_ASSERT(node->src0->ne[3] == 1); + GGML_ASSERT(node->src1->ne[2] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); + + size_t cur = 0; + const int nk = node->src0->ne[0]; - work_size = MAX(work_size, cur); - } break; - case GGML_OP_SCALE: - { - n_tasks[i] = 1; - } break; - case GGML_OP_SET: - case GGML_OP_CONT: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_GET_ROWS: - case GGML_OP_GET_ROWS_BACK: - case GGML_OP_DIAG: - case GGML_OP_DIAG_MASK_ZERO: - { - n_tasks[i] = 1; - } break; - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_MAX_BACK: - case GGML_OP_ROPE: - case GGML_OP_ROPE_BACK: - { - n_tasks[i] = n_threads; - } break; - case GGML_OP_ALIBI: - { - n_tasks[i] = 1; //TODO - } break; - case GGML_OP_CLAMP: - { - n_tasks[i] = 1; //TODO - } break; - case GGML_OP_CONV_1D: - { - n_tasks[i] = n_threads; - - GGML_ASSERT(node->src0->ne[3] == 1); - GGML_ASSERT(node->src1->ne[2] == 1); - GGML_ASSERT(node->src1->ne[3] == 1); - - size_t cur = 0; - const int nk = node->src0->ne[0]; - - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*( - nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + - ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] - ); - } else { - GGML_ASSERT(false); - } + cur = sizeof(ggml_fp16_t)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*( + nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] + + ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1] + ); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CONV_2D: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D: + { + n_tasks = n_threads; - GGML_ASSERT(node->src1->ne[3] == 1); + GGML_ASSERT(node->src1->ne[3] == 1); - const int64_t ne00 = node->src0->ne[0]; // W - const int64_t ne01 = node->src0->ne[1]; // H - const int64_t ne02 = node->src0->ne[2]; // C - const int64_t ne03 = node->src0->ne[3]; // N + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N - const int64_t ne10 = node->src1->ne[0]; // W - const int64_t ne11 = node->src1->ne[1]; // H - const int64_t ne12 = node->src1->ne[2]; // C + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C - const int64_t nk = ne00*ne01; + const int64_t nk = ne00*ne01; - UNUSED(ne02); - UNUSED(ne03); - UNUSED(nk); + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); - size_t cur = 0; + size_t cur = 0; - if (node->src0->type == GGML_TYPE_F16 && + if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); - } else if (node->src0->type == GGML_TYPE_F32 && - node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)* (ne10*ne11*ne12); - } else { - GGML_ASSERT(false); - } + 
cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*ne11*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*ne11*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_FF: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_FF: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - if (node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*node->src1->ne[1]*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*node->src1->ne[1]*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*node->src1->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src1->ne[1]*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_FLASH_ATTN_BACK: - { - n_tasks[i] = n_threads; + work_size = MAX(work_size, cur); + } break; + case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; - size_t cur = 0; + size_t cur = 0; - const int64_t D = node->src0->ne[0]; - const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); - const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back - if (node->src1->type == GGML_TYPE_F32) { - cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 - } + const int64_t D = node->src0->ne[0]; + const int64_t ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL); + const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in ggml_compute_forward_flash_attn_back + if (node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - if 
(node->src1->type == GGML_TYPE_F16) { - cur = sizeof(float)*mxDn*n_tasks[i]; // TODO: this can become (n_tasks[i]-1) - cur += sizeof(float)*mxDn*n_tasks[i]; // this is overestimated by x2 - } + if (node->src1->type == GGML_TYPE_F16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } - work_size = MAX(work_size, cur); - } break; - case GGML_OP_WIN_PART: - case GGML_OP_WIN_UNPART: - case GGML_OP_MAP_UNARY: - case GGML_OP_MAP_BINARY: - case GGML_OP_MAP_CUSTOM1: - case GGML_OP_MAP_CUSTOM2: - case GGML_OP_MAP_CUSTOM3: - { - n_tasks[i] = 1; - } break; - case GGML_OP_CROSS_ENTROPY_LOSS: - { - n_tasks[i] = n_threads; - - size_t cur = ggml_type_size(node->type)*(n_tasks[i] + node->src0->ne[0]*n_tasks[i]); - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_CROSS_ENTROPY_LOSS_BACK: - { - n_tasks[i] = n_threads; - - size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks[i]; - - work_size = MAX(work_size, cur); - } break; - case GGML_OP_NONE: - { - n_tasks[i] = 1; - } break; - case GGML_OP_COUNT: - { - GGML_ASSERT(false); - } break; - } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: + case GGML_OP_MAP_UNARY: + case GGML_OP_MAP_BINARY: + case GGML_OP_MAP_CUSTOM1: + case GGML_OP_MAP_CUSTOM2: + case GGML_OP_MAP_CUSTOM3: + { + n_tasks = 1; + } break; + case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*(n_tasks + node->src0->ne[0]*n_tasks); + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CROSS_ENTROPY_LOSS_BACK: + { + n_tasks = n_threads; + + size_t cur = ggml_type_size(node->type)*node->src0->ne[0]*n_tasks; + + work_size = MAX(work_size, cur); + } break; + case GGML_OP_NONE: + { + n_tasks = 1; + } break; + case GGML_OP_COUNT: + { + GGML_ASSERT(false); + } break; } + + cplan.n_tasks[i] = n_tasks; } if (work_size > 0) { From 551ed0823441537c323d8769e2c59ff32f403e2d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:35:22 +0300 Subject: [PATCH 18/20] ggml : fix indentation in switch --- ggml.c | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 27232af28e0e0..69b38dc70a46d 100644 --- a/ggml.c +++ b/ggml.c @@ -16176,31 +16176,29 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning - } - else + } else #elif defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); - } - else + if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node); + } else #endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { - n_tasks = 1; // TODO: this actually is doing nothing - // the threads are still spinning - if (node->src0->type != GGML_TYPE_F32) { - // here we need memory just for single 2D matrix from src0 - cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); - } - } else + if 
(ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { + n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + if (node->src0->type != GGML_TYPE_F32) { + // here we need memory just for single 2D matrix from src0 + cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); + } + } else #endif - if (node->src1->type != vec_dot_type) { - cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; - } else { - cur = 0; - } + if (node->src1->type != vec_dot_type) { + cur = GGML_TYPE_SIZE[vec_dot_type]*ggml_nelements(node->src1)/GGML_BLCK_SIZE[vec_dot_type]; + } else { + cur = 0; + } work_size = MAX(work_size, cur); } break; From f789f2cef2e40ef9577b29688a55be350849ea99 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jul 2023 21:54:04 +0300 Subject: [PATCH 19/20] llama : avoid unnecessary bool --- llama.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5221ab5a2dd27..ee6ec0920fc9c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1613,14 +1613,11 @@ static bool llama_eval_internal( // run the computation ggml_build_forward_expand(&gf, cur); - bool call_ggml_graph_compute = true; - #ifdef GGML_USE_METAL if (lctx.ctx_metal && N == 1) { ggml_metal_set_n_cb (lctx.ctx_metal, n_threads); ggml_metal_graph_compute(lctx.ctx_metal, &gf); ggml_metal_get_tensor (lctx.ctx_metal, cur); - call_ggml_graph_compute = false; } else { // IMPORTANT: // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla @@ -1637,12 +1634,12 @@ static bool llama_eval_internal( ggml_metal_get_tensor(lctx.ctx_metal, kv_self.k); ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v); } - } -#endif - if (call_ggml_graph_compute) { ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); } +#else + ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads); +#endif if (cgraph_fname) { ggml_graph_export(&gf, cgraph_fname); From c15833c8d6fc6ad3a7239dc2febafca551e61f8a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 7 Jul 2023 19:13:26 +0300 Subject: [PATCH 20/20] ggml : remove comments from source file and match order in header --- ggml.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 69b38dc70a46d..828368e671692 100644 --- a/ggml.c +++ b/ggml.c @@ -16070,7 +16070,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return 0; } -// Prepare for graph computing. 
struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; @@ -16488,8 +16487,16 @@ void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) } } -// same as ggml_graph_compute() but the work data is allocated as a part of the context -// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data +void ggml_graph_reset(struct ggml_cgraph * cgraph) { + for (int i = 0; i < cgraph->n_nodes; i++) { + struct ggml_tensor * grad = cgraph->grads[i]; + + if (grad) { + ggml_set_zero(grad); + } + } +} + void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); @@ -16501,16 +16508,6 @@ void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * ggml_graph_compute(cgraph, &cplan); } -void ggml_graph_reset(struct ggml_cgraph * cgraph) { - for (int i = 0; i < cgraph->n_nodes; i++) { - struct ggml_tensor * grad = cgraph->grads[i]; - - if (grad) { - ggml_set_zero(grad); - } - } -} - struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i];
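Editor's note: for readers skimming the series, the sketch below summarizes how the reworked compute API is meant to be called once these patches are applied. It is a minimal, untested illustration assembled only from the signatures visible in the diffs above (ggml_graph_plan(), ggml_graph_compute(), ggml_graph_compute_with_ctx()); the tensor name `f`, the context name `ctx`, and the thread count are placeholders invented for the example.

    // build a graph as usual
    struct ggml_cgraph gf = ggml_build_forward(f);

    // 1) explicit plan: the caller owns the work buffer
    struct ggml_cplan plan = ggml_graph_plan(&gf, /*n_threads*/ 4);
    uint8_t * work = NULL;
    if (plan.work_size > 0) {
        work = malloc(plan.work_size);   // caller-provided scratch memory
        plan.work_data = work;
    }
    ggml_graph_compute(&gf, &plan);
    free(work);

    // 2) convenience wrapper: the work data is allocated inside the ggml_context,
    //    so the context must have been created with enough spare memory
    ggml_graph_compute_with_ctx(ctx, &gf, /*n_threads*/ 4);

The first form is what the examples, tests, and llama.cpp now wrap in their local ggml_graph_compute_helper(); the second is the drop-in replacement for the old single-call ggml_graph_compute(ctx, &gf).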