From 6fc5f17e21feb5c91fccc7c2e0c028688d5a7031 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 21 May 2023 14:09:52 -0400 Subject: [PATCH 01/18] detect NUMA systems and pin work threads to nodes (linux) --- ggml.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++-------- ggml.h | 3 + llama.cpp | 1 + 3 files changed, 156 insertions(+), 25 deletions(-) diff --git a/ggml.c b/ggml.c index d36bb22815874..910c2c03a3a12 100644 --- a/ggml.c +++ b/ggml.c @@ -76,6 +76,11 @@ static int sched_yield (void) { #include typedef void* thread_ret_t; + +#include +#include +#include + #endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -103,6 +108,30 @@ typedef void* thread_ret_t; #define GGML_SOFT_MAX_UNROLL 4 #define GGML_VEC_DOT_UNROLL 2 +// +// logging +// + +#if (GGML_DEBUG >= 1) +#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG(...) +#endif + +#if (GGML_DEBUG >= 5) +#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_5(...) +#endif + +#if (GGML_DEBUG >= 10) +#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) +#else +#define GGML_PRINT_DEBUG_10(...) +#endif + +#define GGML_PRINT(...) printf(__VA_ARGS__) + #ifdef GGML_USE_ACCELERATE // uncomment to use vDSP for soft max computation // note: not sure if it is actually faster @@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { } } - // // timing // @@ -452,6 +480,85 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif +// +// NUMA support +// + +struct ggml_numa_node +{ + uint32_t *cpus; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes +{ + struct ggml_numa_node *nodes; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + +struct ggml_numa_nodes ggml_numa = { + .nodes = NULL, + .n_nodes = 0, + .total_cpus = 0, +}; + +void ggml_numa_init(void) +{ + if (ggml_numa.n_nodes > 0) return; +#ifdef __linux__ + struct stat st; + char path[256]; + int rv; + // enumerate nodes + while (true) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) break; + ++ggml_numa.n_nodes; + } + // enumerate CPUs + while (true) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) break; + ++ggml_numa.total_cpus; + } + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus); + ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node)); + GGML_ASSERT(ggml_numa.nodes != NULL); + for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) { + struct ggml_numa_node *node = &ggml_numa.nodes[n]; + node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t)); + GGML_ASSERT(node->cpus != NULL); + GGML_PRINT_DEBUG("CPUs on node %u:", n); + for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + if (ggml_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + 
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + // TODO +#endif +} + +bool ggml_is_numa(void) { return ggml_numa.n_nodes > 1; } + // // cache line // @@ -3405,30 +3512,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x *s = 1.f/(*s); } -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) printf(__VA_ARGS__) - // // data types // @@ -13966,6 +14049,49 @@ typedef pthread_t ggml_thread_t; #endif +#ifdef __linux__ +void set_numa_thread_affinity(int thread_n, int n_threads) +{ + if (!ggml_is_numa()) return; + // run thread on node_num thread_n / (threads per node) + int node_num = thread_n / (n_threads / ggml_numa.n_nodes); + struct ggml_numa_node *node = &ggml_numa.nodes[node_num]; + size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); + cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (size_t i=0; i < node->n_cpus; ++i) { + CPU_SET_S(node->cpus[i], setsize, cpus); + } + int rv; + if ((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + CPU_FREE(cpus); +} +void clear_numa_thread_affinity(void) +{ + if (!ggml_is_numa()) return; + size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); + cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); + CPU_ZERO_S(setsize, cpus); + for (unsigned i=0; i < ggml_numa.total_cpus; ++i) { + CPU_SET_S(i, setsize, cpus); + } + int rv; + if((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) { + fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", + strerror(rv)); + } + CPU_FREE(cpus); +} +#else +// TODO: Windows etc. 
+// (the linux implementation may also work on BSD, someone should test) +void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +void clear_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +#endif + struct ggml_compute_state_shared { ggml_lock_t spin; @@ -13990,6 +14116,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; const int n_threads = state->shared->n_threads; + set_numa_thread_affinity(state->params.ith, n_threads); while (true) { if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { diff --git a/ggml.h b/ggml.h index 51a616c501bb3..305cfc3aea99b 100644 --- a/ggml.h +++ b/ggml.h @@ -417,6 +417,9 @@ extern "C" { GGML_API int64_t ggml_cycles(void); GGML_API int64_t ggml_cycles_per_ms(void); + GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems + GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node + GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_objects(const struct ggml_context * ctx); diff --git a/llama.cpp b/llama.cpp index 4cbc8d6b63752..468f96cc142d0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -851,6 +851,7 @@ bool llama_mlock_supported() { void llama_init_backend() { ggml_time_init(); + ggml_numa_init(); // needed to initialize f16 tables { From 0d23f8ce8da6bb13d557233c7b76a5563055fcf5 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 21 May 2023 16:33:10 -0400 Subject: [PATCH 02/18] disable mmap prefetch/readahead for NUMA systems --- llama-util.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/llama-util.h b/llama-util.h index 3cac9f681800b..1f6c0d9cdcbcd 100644 --- a/llama-util.h +++ b/llama-util.h @@ -163,6 +163,9 @@ static std::string llama_format_win_err(DWORD err) { } #endif +extern "C" { +bool ggml_is_numa(); +} struct llama_mmap { void * addr; size_t size; @@ -176,8 +179,10 @@ struct llama_mmap { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; + // prefetch/readahead impairs performance on NUMA systems + if (ggml_is_numa()) prefetch = 0; #ifdef __linux__ - flags |= MAP_POPULATE; + if (prefetch) flags |= MAP_POPULATE; #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { @@ -191,6 +196,14 @@ struct llama_mmap { strerror(errno)); } } + if (ggml_is_numa()) { + // advise the kernel not to use readahead + // (because the next page might not belong on the same node) + if (madvise(addr, file->size, MADV_RANDOM)) { + fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", + strerror(errno)); + } + } } ~llama_mmap() { From 9d058c2096b9f1f300e1ee16f5740a6a0a342917 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 21 May 2023 18:11:03 -0400 Subject: [PATCH 03/18] avoid sending finalize op to thread pool if it does nothing --- ggml.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ggml.c b/ggml.c index 910c2c03a3a12..4ac7cb51a3414 100644 --- a/ggml.c +++ b/ggml.c @@ -3698,6 +3698,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; +// only send finalize op to thread pool if it actually does something +// currently none of them? 
+static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { + 0 +}; + static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -14541,7 +14547,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } // FINALIZE - if (node->n_tasks > 1) { + if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { atomic_store(&state_shared.has_work, false); } @@ -14577,7 +14583,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) ggml_compute_forward(¶ms, node); // wait for thread pool - if (node->n_tasks > 1) { + if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { atomic_store(&state_shared.has_work, false); } From 2c1b5ae1971c04df575cae4c828c0b3ef8fb57dc Mon Sep 17 00:00:00 2001 From: zrm Date: Tue, 23 May 2023 17:08:37 -0400 Subject: [PATCH 04/18] silence robot --- ggml.c | 26 +++++++++++++++----------- llama-util.h | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/ggml.c b/ggml.c index 4ac7cb51a3414..6017d9b394984 100644 --- a/ggml.c +++ b/ggml.c @@ -505,7 +505,7 @@ struct ggml_numa_nodes ggml_numa = { void ggml_numa_init(void) { - if (ggml_numa.n_nodes > 0) return; + if (ggml_numa.n_nodes > 0) { return; } #ifdef __linux__ struct stat st; char path[256]; @@ -514,17 +514,21 @@ void ggml_numa_init(void) while (true) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) break; + if (stat(path, &st) != 0) { break; } ++ggml_numa.n_nodes; } // enumerate CPUs while (true) { rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) break; + if (stat(path, &st) != 0) { break; } ++ggml_numa.total_cpus; } GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus); + if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) { + ggml_numa.n_nodes = 0; + return; + } ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node)); GGML_ASSERT(ggml_numa.nodes != NULL); for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) { @@ -14058,18 +14062,18 @@ typedef pthread_t ggml_thread_t; #ifdef __linux__ void set_numa_thread_affinity(int thread_n, int n_threads) { - if (!ggml_is_numa()) return; + if (!ggml_is_numa()) { return; } // run thread on node_num thread_n / (threads per node) int node_num = thread_n / (n_threads / ggml_numa.n_nodes); struct ggml_numa_node *node = &ggml_numa.nodes[node_num]; size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); CPU_ZERO_S(setsize, cpus); - for (size_t i=0; i < node->n_cpus; ++i) { + for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); } - int rv; - if ((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) { + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } @@ -14077,15 +14081,15 @@ void set_numa_thread_affinity(int thread_n, int n_threads) } void clear_numa_thread_affinity(void) { - if (!ggml_is_numa()) return; + if (!ggml_is_numa()) { return; } size_t setsize = 
CPU_ALLOC_SIZE(ggml_numa.total_cpus); cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); CPU_ZERO_S(setsize, cpus); - for (unsigned i=0; i < ggml_numa.total_cpus; ++i) { + for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) { CPU_SET_S(i, setsize, cpus); } - int rv; - if((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) { + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); + if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } diff --git a/llama-util.h b/llama-util.h index 1f6c0d9cdcbcd..bb7155036116e 100644 --- a/llama-util.h +++ b/llama-util.h @@ -180,9 +180,9 @@ struct llama_mmap { int fd = fileno(file->fp); int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems - if (ggml_is_numa()) prefetch = 0; + if (ggml_is_numa()) { prefetch = 0; } #ifdef __linux__ - if (prefetch) flags |= MAP_POPULATE; + if (prefetch) { flags |= MAP_POPULATE; } #endif addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); if (addr == MAP_FAILED) { From 8502d5178eb1dd1900d69fa48f08c06724adff89 Mon Sep 17 00:00:00 2001 From: zrm Date: Tue, 23 May 2023 17:09:52 -0400 Subject: [PATCH 05/18] fix args --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 6017d9b394984..8cfe9d14dd3eb 100644 --- a/ggml.c +++ b/ggml.c @@ -14099,7 +14099,7 @@ void clear_numa_thread_affinity(void) // TODO: Windows etc. // (the linux implementation may also work on BSD, someone should test) void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } -void clear_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } +void clear_numa_thread_affinity() {} #endif struct ggml_compute_state_shared { From bf83dcb279c5ff3795b178b5a1e155a138be0c53 Mon Sep 17 00:00:00 2001 From: zrm Date: Sat, 17 Jun 2023 15:03:14 -0400 Subject: [PATCH 06/18] make --numa a param --- examples/common.cpp | 5 +++++ examples/common.h | 1 + examples/main/main.cpp | 4 ++++ llama.cpp | 1 - 4 files changed, 10 insertions(+), 1 deletion(-) diff --git a/examples/common.cpp b/examples/common.cpp index 1308f84109519..aad21898d9cb3 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -288,6 +288,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.use_mmap = false; } else if (arg == "--mtest") { params.mem_test = true; + } else if (arg == "--numa") { + params.numa = true; } else if (arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-r" || arg == "--reverse-prompt") { @@ -421,6 +423,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { if (llama_mmap_supported()) { fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } + fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n"); + fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n"); + fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); fprintf(stderr, " number of layers to store in VRAM\n"); fprintf(stderr, " --mtest compute maximum memory usage\n"); diff --git a/examples/common.h b/examples/common.h index 2b66382a6a5e0..9d74bd7b84b58 100644 --- a/examples/common.h +++ b/examples/common.h @@ -70,6 +70,7 @@ struct gpt_params { bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use 
mlock to keep model in memory bool mem_test = false; // compute maximum memory usage + bool numa = false; // attempt optimizations that help on some NUMA systems bool verbose_prompt = false; // print prompt tokens before generation }; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 47b418d972bbc..100e9d65b0e28 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,6 +5,7 @@ #include "common.h" #include "llama.h" +#include "ggml.h" #include "build-info.h" #include @@ -97,6 +98,9 @@ int main(int argc, char ** argv) { } llama_init_backend(); + if (params.numa) { + ggml_numa_init(); + } llama_context * ctx; g_ctx = &ctx; diff --git a/llama.cpp b/llama.cpp index 468f96cc142d0..4cbc8d6b63752 100644 --- a/llama.cpp +++ b/llama.cpp @@ -851,7 +851,6 @@ bool llama_mlock_supported() { void llama_init_backend() { ggml_time_init(); - ggml_numa_init(); // needed to initialize f16 tables { From b71dfe637fc3b2823aa346730eb2ab9bb2c3eeed Mon Sep 17 00:00:00 2001 From: zrm Date: Sat, 17 Jun 2023 15:11:05 -0400 Subject: [PATCH 07/18] recommendation that n_nodes evenly divide n_threads did not warrant such aggressive enforcement --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 8cfe9d14dd3eb..002b982bd6ace 100644 --- a/ggml.c +++ b/ggml.c @@ -14064,7 +14064,7 @@ void set_numa_thread_affinity(int thread_n, int n_threads) { if (!ggml_is_numa()) { return; } // run thread on node_num thread_n / (threads per node) - int node_num = thread_n / (n_threads / ggml_numa.n_nodes); + int node_num = thread_n / ((n_threads + ggml_numa.n_nodes - 1) / ggml_numa.n_nodes); struct ggml_numa_node *node = &ggml_numa.nodes[node_num]; size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); From adaad10e97828da0d29f938b34ad6975cc23dbb7 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 18 Jun 2023 02:03:41 -0400 Subject: [PATCH 08/18] lower synchronization overhead --- ggml.c | 339 +++++++++++++++++++-------------------------------------- 1 file changed, 110 insertions(+), 229 deletions(-) diff --git a/ggml.c b/ggml.c index 002b982bd6ace..3f7fdb2acd0d1 100644 --- a/ggml.c +++ b/ggml.c @@ -3702,12 +3702,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "f(x,y)", }; -// only send finalize op to thread pool if it actually does something -// currently none of them? -static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { - 0 -}; - static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); @@ -14099,75 +14093,114 @@ void clear_numa_thread_affinity(void) // TODO: Windows etc. 
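
The ceiling division in set_numa_thread_affinity is what lets n_threads avoid being an exact multiple of n_nodes: with the old expression thread_n / (n_threads / n_nodes), 6 threads on 4 nodes gives a divisor of 1 and sends threads 4 and 5 to nonexistent nodes, while the ceiling form groups them two per node. A few lines make the mapping visible (the thread and node counts are made-up example values):

    // thread -> node mapping with the ceiling-divide formula used above
    #include <stdio.h>

    int main(void) {
        const int n_threads = 6, n_nodes = 4;  // hypothetical counts, not from the patch
        for (int thread_n = 0; thread_n < n_threads; ++thread_n) {
            const int node_num = thread_n / ((n_threads + n_nodes - 1) / n_nodes);
            printf("thread %d -> node %d\n", thread_n, node_num);
        }
        // prints: threads 0,1 -> node 0; 2,3 -> node 1; 4,5 -> node 2 (node 3 stays idle)
        return 0;
    }
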
// (the linux implementation may also work on BSD, someone should test) void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); } -void clear_numa_thread_affinity() {} +void clear_numa_thread_affinity(void) {} #endif struct ggml_compute_state_shared { - ggml_lock_t spin; - + struct ggml_cgraph * cgraph; + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; int n_threads; // synchronization primitives - atomic_int n_ready; - atomic_bool has_work; - atomic_bool stop; // stop all threads + atomic_int n_active; // num active threads + atomic_int node_n; // active graph node }; struct ggml_compute_state { ggml_thread_t thrd; - - struct ggml_compute_params params; - struct ggml_tensor * node; - + int ith; struct ggml_compute_state_shared * shared; }; +inline void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) +{ + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; - + struct ggml_cgraph * cgraph = state->shared->cgraph; const int n_threads = state->shared->n_threads; - set_numa_thread_affinity(state->params.ith, n_threads); + set_numa_thread_affinity(state->ith, n_threads); + int node_n = -1; while (true) { - if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { - atomic_store(&state->shared->has_work, false); - } else { - while (atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; - } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); + if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { + // all other threads are finished and spinning + // do finalize and init here so we don't have synchronize again + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_FINALIZE, + /*.ith =*/ 0, + /*.nth =*/ 0, + /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, + /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + }; + if (node_n != -1) { + /* FINALIZE */ + struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; + params.nth = node->n_tasks; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); } - } - - atomic_fetch_sub(&state->shared->n_ready, 1); - - // wait for work - while (!atomic_load(&state->shared->has_work)) { - if (atomic_load(&state->shared->stop)) { - return 0; + // distribute new work or execute it direct if 1T + while (++node_n < cgraph->n_nodes) { + GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); + + struct ggml_tensor * node = cgraph->nodes[node_n]; + + state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_time_us = ggml_perf_time_us(); + + /* INIT */ + params.type = GGML_TASK_INIT; + params.nth = node->n_tasks; + ggml_compute_forward(¶ms, node); + + if (node->n_tasks == 1) { + // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, + // they do something more efficient than spinning (?) 
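
Stripped of the ggml specifics, the scheduling above is a compact pattern: every worker decrements a shared countdown, the last one to arrive does the serial FINALIZE/INIT work and publishes the next node index, and the rest yield until that index changes. A self-contained sketch of just that pattern (the step count, thread count, and dummy workload are invented for illustration):

    // last-to-arrive thread advances a shared step counter; the others yield until it moves
    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>
    #include <stdio.h>

    #define N_THREADS 4
    #define N_STEPS   8

    static atomic_int n_active = N_THREADS;  // threads still working on the current step
    static atomic_int step_n   = -1;         // index of the step being executed

    static void do_work(int step, int tid) {
        printf("step %d done by thread %d\n", step, tid);
    }

    static void * worker(void * arg) {
        const int tid = (int)(long) arg;
        int step = -1;
        while (1) {
            if (atomic_fetch_sub(&n_active, 1) == 1) {
                // last one in: serial work (FINALIZE/INIT in the patch) goes here,
                // then the next step is published for everyone
                ++step;
                atomic_store(&n_active, N_THREADS);
                atomic_store(&step_n, step);
            } else {
                const int last = step;
                do {
                    sched_yield();
                    step = atomic_load(&step_n);
                } while (step == last);
            }
            if (step >= N_STEPS) { break; }
            do_work(step, tid);  // the parallel COMPUTE phase, done by every thread
        }
        return NULL;
    }

    int main(void) {
        pthread_t th[N_THREADS];
        for (long i = 0; i < N_THREADS; ++i) { pthread_create(&th[i], NULL, worker, (void *) i); }
        for (int  i = 0; i < N_THREADS; ++i) { pthread_join(th[i], NULL); }
        return 0;
    }
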
+ params.type = GGML_TASK_COMPUTE; + ggml_compute_forward(¶ms, node); + params.type = GGML_TASK_FINALIZE; + ggml_compute_forward(¶ms, node); + ggml_graph_compute_perf_stats_node(node, state->shared); + } else { + break; + } } - ggml_lock_lock (&state->shared->spin); - ggml_lock_unlock(&state->shared->spin); + atomic_store(&state->shared->n_active, n_threads); + atomic_store(&state->shared->node_n, node_n); + } else { + // wait for other threads to finish + const int last = node_n; + do { + sched_yield(); + node_n = atomic_load(&state->shared->node_n); + } while (node_n == last); } - // check if we should stop - if (atomic_load(&state->shared->stop)) { - break; - } - - if (state->node) { - if (state->params.ith < state->params.nth) { - ggml_compute_forward(&state->params, state->node); - } - - state->node = NULL; + if (node_n >= cgraph->n_nodes) break; + struct ggml_tensor * node = cgraph->nodes[node_n]; + /* COMPUTE */ + struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_COMPUTE, + /*.ith =*/ state->ith, + /*.nth =*/ node->n_tasks, + /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, + /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, + }; + if(state->ith < node->n_tasks) { + ggml_compute_forward(¶ms, node); } else { break; } } - return 0; } @@ -14175,39 +14208,14 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { - /*.spin =*/ GGML_LOCK_INITIALIZER, + /*.cgraph =*/ cgraph, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, - /*.n_ready =*/ 0, - /*.has_work =*/ false, - /*.stop =*/ false, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, }; - struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL; - - // create thread pool - if (n_threads > 1) { - ggml_lock_init(&state_shared.spin); - - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { - workers[j] = (struct ggml_compute_state) { - .thrd = 0, - .params = { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = n_threads, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }, - .node = NULL, - .shared = &state_shared, - }; - - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } + struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); // initialize tasks + work buffer { @@ -14467,167 +14475,40 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); } } + + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + workers[j] = (struct ggml_compute_state) { + .thrd = 0, + .ith = j, + .shared = &state_shared, + }; - const int64_t perf_start_cycles = ggml_perf_cycles(); - const int64_t perf_start_time_us = ggml_perf_time_us(); - - for (int i = 0; i < cgraph->n_nodes; i++) { - GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, i, cgraph->n_nodes); - - struct ggml_tensor * node = cgraph->nodes[i]; - - // TODO: this could be used to avoid unnecessary computations, but it needs to be improved - //if (node->grad == NULL && node->perf_runs > 0) { - // continue; - //} - - const int64_t perf_node_start_cycles = ggml_perf_cycles(); - const int64_t perf_node_start_time_us = ggml_perf_time_us(); - - // INIT - struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, - /*.ith =*/ 0, - /*.nth =*/ node->n_tasks, - /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, - /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, - }; - - ggml_compute_forward(¶ms, node); - - // COMPUTE - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_COMPUTE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? 
cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); - } - - params.type = GGML_TASK_COMPUTE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - - // FINALIZE - if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - // launch thread pool - for (int j = 0; j < n_threads - 1; j++) { - workers[j].params = (struct ggml_compute_params) { - .type = GGML_TASK_FINALIZE, - .ith = j + 1, - .nth = node->n_tasks, - .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0, - .wdata = cgraph->work ? cgraph->work->data : NULL, - }; - workers[j].node = node; - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) > 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_store(&state_shared.has_work, true); + int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); } + } + workers[0].ith = 0; + workers[0].shared = &state_shared; - params.type = GGML_TASK_FINALIZE; - ggml_compute_forward(¶ms, node); - - // wait for thread pool - if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { - if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { - atomic_store(&state_shared.has_work, false); - } - - while (atomic_load(&state_shared.has_work)) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - - atomic_fetch_sub(&state_shared.n_ready, 1); - - while (atomic_load(&state_shared.n_ready) != 0) { - ggml_lock_lock (&state_shared.spin); - ggml_lock_unlock(&state_shared.spin); - } - } - // performance stats (node) - { - int64_t perf_cycles_cur = ggml_perf_cycles() - perf_node_start_cycles; - int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us; + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); - node->perf_runs++; - node->perf_cycles += perf_cycles_cur; - node->perf_time_us += perf_time_us_cur; - } - } + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); // join thread pool if (n_threads > 1) { - atomic_store(&state_shared.stop, true); - atomic_store(&state_shared.has_work, true); - - for (int j = 0; j < n_threads - 1; j++) { + for (int j = 1; j < n_threads; j++) { int rc = ggml_thread_join(workers[j].thrd, NULL); GGML_ASSERT(rc == 0); UNUSED(rc); } - - ggml_lock_destroy(&state_shared.spin); } // performance stats (graph) From c31d51d40d505160623d326dfb808fc0c3627fe5 Mon 
Sep 17 00:00:00 2001 From: zrm Date: Sun, 18 Jun 2023 02:34:08 -0400 Subject: [PATCH 09/18] statically allocate --- ggml.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 3f7fdb2acd0d1..4fe0707c826ab 100644 --- a/ggml.c +++ b/ggml.c @@ -484,21 +484,23 @@ int64_t ggml_cycles_per_ms(void) { // NUMA support // +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + struct ggml_numa_node { - uint32_t *cpus; // hardware threads on this node + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node uint32_t n_cpus; }; struct ggml_numa_nodes { - struct ggml_numa_node *nodes; + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system }; struct ggml_numa_nodes ggml_numa = { - .nodes = NULL, .n_nodes = 0, .total_cpus = 0, }; @@ -511,14 +513,14 @@ void ggml_numa_init(void) char path[256]; int rv; // enumerate nodes - while (true) { + while (ggml_numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } ++ggml_numa.n_nodes; } // enumerate CPUs - while (true) { + while (ggml_numa.total_cpus < GGML_NUMA_MAX_CPUS) { rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus); GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); if (stat(path, &st) != 0) { break; } @@ -529,12 +531,8 @@ void ggml_numa_init(void) ggml_numa.n_nodes = 0; return; } - ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node)); - GGML_ASSERT(ggml_numa.nodes != NULL); for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) { struct ggml_numa_node *node = &ggml_numa.nodes[n]; - node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t)); - GGML_ASSERT(node->cpus != NULL); GGML_PRINT_DEBUG("CPUs on node %u:", n); for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); From 2f5bb462fd8179f484e6b04f119116f0a7bce8c4 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 18 Jun 2023 11:59:27 -0400 Subject: [PATCH 10/18] move numa state to g_state --- ggml.c | 176 ++++++++++++++++++++++++++------------------------- llama-util.h | 5 +- 2 files changed, 91 insertions(+), 90 deletions(-) diff --git a/ggml.c b/ggml.c index 4fe0707c826ab..b76dc5e9a33ac 100644 --- a/ggml.c +++ b/ggml.c @@ -480,86 +480,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif -// -// NUMA support -// - -#define GGML_NUMA_MAX_NODES 8 -#define GGML_NUMA_MAX_CPUS 512 - -struct ggml_numa_node -{ - uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node - uint32_t n_cpus; -}; - -struct ggml_numa_nodes -{ - struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; - uint32_t n_nodes; - uint32_t total_cpus; // hardware threads on system -}; - -struct ggml_numa_nodes ggml_numa = { - .n_nodes = 0, - .total_cpus = 0, -}; - -void ggml_numa_init(void) -{ - if (ggml_numa.n_nodes > 0) { return; } -#ifdef __linux__ - struct stat st; - char path[256]; - int rv; - // enumerate nodes - while (ggml_numa.n_nodes < GGML_NUMA_MAX_NODES) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++ggml_numa.n_nodes; - } - // enumerate CPUs - while (ggml_numa.total_cpus < GGML_NUMA_MAX_CPUS) { - rv = snprintf(path, sizeof(path), 
"/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) != 0) { break; } - ++ggml_numa.total_cpus; - } - GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus); - if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) { - ggml_numa.n_nodes = 0; - return; - } - for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) { - struct ggml_numa_node *node = &ggml_numa.nodes[n]; - GGML_PRINT_DEBUG("CPUs on node %u:", n); - for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) { - rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); - GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); - if (stat(path, &st) == 0) { - node->cpus[node->n_cpus++] = c; - GGML_PRINT_DEBUG(" %u", c); - } - } - GGML_PRINT_DEBUG("\n"); - } - if (ggml_is_numa()) { - FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); - if (fptr != NULL) { - char buf[42]; - if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { - GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); - } - fclose(fptr); - } - } -#else - // TODO -#endif -} - -bool ggml_is_numa(void) { return ggml_numa.n_nodes > 1; } // // cache line @@ -3750,12 +3670,33 @@ struct ggml_compute_params { void * wdata; }; +// +// NUMA support +// + +#define GGML_NUMA_MAX_NODES 8 +#define GGML_NUMA_MAX_CPUS 512 + +struct ggml_numa_node +{ + uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node + uint32_t n_cpus; +}; + +struct ggml_numa_nodes +{ + struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; + uint32_t n_nodes; + uint32_t total_cpus; // hardware threads on system +}; + // // ggml state // struct ggml_state { struct ggml_context_container contexts[GGML_MAX_CONTEXTS]; + struct ggml_numa_nodes numa; }; // global state @@ -3780,6 +3721,63 @@ inline static void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } +void ggml_numa_init(void) +{ + if (g_state.numa.n_nodes > 0) { return; } +#ifdef __linux__ + struct stat st; + char path[256]; + int rv; + // enumerate nodes + while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.n_nodes; + } + // enumerate CPUs + while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) != 0) { break; } + ++g_state.numa.total_cpus; + } + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { + g_state.numa.n_nodes = 0; + return; + } + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { + struct ggml_numa_node * node = &g_state.numa.nodes[n]; + GGML_PRINT_DEBUG("CPUs on node %u:", n); + node->n_cpus = 0; + for (uint32_t c = 0; c < g_state.numa.total_cpus; ++c) { + rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); + GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); + if (stat(path, &st) == 0) { + node->cpus[node->n_cpus++] = c; + GGML_PRINT_DEBUG(" %u", c); + } + } + GGML_PRINT_DEBUG("\n"); + } + if (ggml_is_numa()) { + FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); + if (fptr != 
NULL) { + char buf[42]; + if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { + GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); + } + fclose(fptr); + } + } +#else + // TODO +#endif +} + +bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; } + //////////////////////////////////////////////////////////////////////////////// void ggml_print_object(const struct ggml_object * obj) { @@ -3995,6 +3993,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { g_state = (struct ggml_state) { /*.contexts =*/ { { 0 } }, + /*.numa =*/ { + .n_nodes = 0, + .total_cpus = 0, + }, }; for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) { @@ -14056,10 +14058,10 @@ void set_numa_thread_affinity(int thread_n, int n_threads) { if (!ggml_is_numa()) { return; } // run thread on node_num thread_n / (threads per node) - int node_num = thread_n / ((n_threads + ggml_numa.n_nodes - 1) / ggml_numa.n_nodes); - struct ggml_numa_node *node = &ggml_numa.nodes[node_num]; - size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); - cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); + int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); @@ -14074,10 +14076,10 @@ void set_numa_thread_affinity(int thread_n, int n_threads) void clear_numa_thread_affinity(void) { if (!ggml_is_numa()) { return; } - size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); - cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); - for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) { + for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { CPU_SET_S(i, setsize, cpus); } int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); diff --git a/llama-util.h b/llama-util.h index bb7155036116e..3d47be0735a50 100644 --- a/llama-util.h +++ b/llama-util.h @@ -16,6 +16,8 @@ #include #include +#include "ggml.h" + #ifdef __has_include #if __has_include() #include @@ -163,9 +165,6 @@ static std::string llama_format_win_err(DWORD err) { } #endif -extern "C" { -bool ggml_is_numa(); -} struct llama_mmap { void * addr; size_t size; From 4b9458215bdb45cd97f633ff974eb35c4c8ea352 Mon Sep 17 00:00:00 2001 From: zrm Date: Sun, 18 Jun 2023 12:36:35 -0400 Subject: [PATCH 11/18] add description for --numa --- examples/main/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/main/README.md b/examples/main/README.md index 7c03f92c897d9..7e9d81c29e012 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -262,6 +262,10 @@ These options help improve the performance and memory usage of the LLaMA models. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. 
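
On NUMA machines the loader additionally maps the file without `MAP_POPULATE` and advises `MADV_RANDOM`, so pages are faulted in by whichever thread first touches them rather than being prefetched onto one node (see the `--numa` option below). A minimal sketch of that mapping strategy, assuming Linux and a hypothetical `model.bin` path:

    // map a model file the way the NUMA path does: no MAP_POPULATE, readahead disabled
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void) {
        const char * fname = "model.bin";  // hypothetical path
        int fd = open(fname, O_RDONLY);
        if (fd < 0) { perror("open"); return 1; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return 1; }

        // read-only shared mapping; deliberately no MAP_POPULATE, so nothing is prefetched
        // and each page is pulled in by the first thread that touches it
        void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
        if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

        // also ask the kernel to skip readahead: the next page of the file may be
        // wanted by a thread on a different node
        if (madvise(addr, st.st_size, MADV_RANDOM) != 0) {
            fprintf(stderr, "warning: madvise(MADV_RANDOM) failed: %s\n", strerror(errno));
        }

        munmap(addr, st.st_size);
        close(fd);
        return 0;
    }
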
Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all. +### NUMA support + +- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root. + ### Memory Float 32 - `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage. From d0e35963504a22c9549759a566ef0b64e2f8939c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 18:45:36 +0300 Subject: [PATCH 12/18] ggml : minor style changes --- ggml.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/ggml.c b/ggml.c index 06a7198f99640..2d7761cc92cf7 100644 --- a/ggml.c +++ b/ggml.c @@ -16342,6 +16342,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, }; + if (node_n != -1) { /* FINALIZE */ struct ggml_tensor * node = state->shared->cgraph->nodes[node_n]; @@ -16349,18 +16350,19 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); } + // distribute new work or execute it direct if 1T while (++node_n < cgraph->n_nodes) { GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes); struct ggml_tensor * node = cgraph->nodes[node_n]; - state->shared->perf_node_start_cycles = ggml_perf_cycles(); + state->shared->perf_node_start_cycles = ggml_perf_cycles(); state->shared->perf_node_start_time_us = ggml_perf_time_us(); /* INIT */ params.type = GGML_TASK_INIT; - params.nth = node->n_tasks; + params.nth = node->n_tasks; ggml_compute_forward(¶ms, node); if (node->n_tasks == 1) { @@ -16368,6 +16370,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // they do something more efficient than spinning (?) params.type = GGML_TASK_COMPUTE; ggml_compute_forward(¶ms, node); + params.type = GGML_TASK_FINALIZE; ggml_compute_forward(¶ms, node); ggml_graph_compute_perf_stats_node(node, state->shared); @@ -16375,6 +16378,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { break; } } + atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_n, node_n); } else { @@ -16387,8 +16391,9 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } // check if we should stop if (node_n >= cgraph->n_nodes) break; - struct ggml_tensor * node = cgraph->nodes[node_n]; + /* COMPUTE */ + struct ggml_tensor * node = cgraph->nodes[node_n]; struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, @@ -16396,12 +16401,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0, /*.wdata =*/ cgraph->work ? 
cgraph->work->data : NULL, }; + if(state->ith < node->n_tasks) { ggml_compute_forward(¶ms, node); } else { break; } } + return 0; } @@ -16409,12 +16416,12 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) const int n_threads = cgraph->n_threads; struct ggml_compute_state_shared state_shared = { - /*.cgraph =*/ cgraph, - /*.perf_node_start_cycles =*/ 0, + /*.cgraph =*/ cgraph, + /*.perf_node_start_cycles =*/ 0, /*.perf_node_start_time_us =*/ 0, - /*.n_threads =*/ n_threads, - /*.n_active =*/ n_threads, - /*.node_n =*/ -1, + /*.n_threads =*/ n_threads, + /*.n_active =*/ n_threads, + /*.node_n =*/ -1, }; struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); @@ -16760,7 +16767,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size); } } - + // create thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; ++j) { @@ -16778,7 +16785,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) workers[0].ith = 0; workers[0].shared = &state_shared; - const int64_t perf_start_cycles = ggml_perf_cycles(); const int64_t perf_start_time_us = ggml_perf_time_us(); From 67ba34e88feabbba43db5932ee7f05ac56c3b575 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 18:55:09 +0300 Subject: [PATCH 13/18] ggml : minor style + try fix sanitizer build --- ggml.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml.c b/ggml.c index 2d7761cc92cf7..496d9bbcadf50 100644 --- a/ggml.c +++ b/ggml.c @@ -16299,13 +16299,15 @@ void clear_numa_thread_affinity(void) {} struct ggml_compute_state_shared { struct ggml_cgraph * cgraph; + int64_t perf_node_start_cycles; int64_t perf_node_start_time_us; + int n_threads; // synchronization primitives atomic_int n_active; // num active threads - atomic_int node_n; // active graph node + atomic_int node_n; // active graph node }; struct ggml_compute_state { @@ -16314,13 +16316,12 @@ struct ggml_compute_state { struct ggml_compute_state_shared * shared; }; -inline void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) -{ - int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; node->perf_runs++; - node->perf_cycles += cycles_cur; + node->perf_cycles += cycles_cur; node->perf_time_us += time_us_cur; } From 0fe4b00de249194c134b72fd7a89c0550c4e84b7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:24:17 +0300 Subject: [PATCH 14/18] llama : allow to initialize backend with NUMA support --- examples/embedding/embedding.cpp | 2 +- examples/main/main.cpp | 6 +----- examples/perplexity/perplexity.cpp | 2 +- examples/quantize/quantize.cpp | 2 +- examples/simple/simple.cpp | 2 +- ggml.c | 26 ++++++++++++++++++-------- llama.cpp | 6 +++++- llama.h | 3 ++- 8 files changed, 30 insertions(+), 19 deletions(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 369eac1d1c391..3cd5bb794957c 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -35,7 +35,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); 
} - llama_init_backend(); + llama_init_backend(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c7193627afacf..bcdc98d611250 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,7 +5,6 @@ #include "common.h" #include "llama.h" -#include "ggml.h" #include "build-info.h" #include @@ -106,10 +105,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(); - if (params.numa) { - ggml_numa_init(); - } + llama_init_backend(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index b59f5971e3dd2..f8a6cb516d067 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -147,7 +147,7 @@ int main(int argc, char ** argv) { params.prompt = gpt_random_prompt(rng); } - llama_init_backend(); + llama_init_backend(params.numa); llama_model * model; llama_context * ctx; diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 4e8e6f5239c05..1eb0f75d6dc79 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -180,7 +180,7 @@ int main(int argc, char ** argv) { usage(argv[0]); } - llama_init_backend(); + llama_init_backend(false); // parse command line arguments const std::string fname_inp = argv[arg_idx]; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index fc45c93406bc4..2d913cebb813a 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -66,7 +66,7 @@ int main(int argc, char ** argv) // Init LLM : //--------------------------------- - llama_init_backend(); + llama_init_backend(params.numa); llama_model * model; llama_context * ctx; diff --git a/ggml.c b/ggml.c index 7ff6254c5486c..df8370960be6b 100644 --- a/ggml.c +++ b/ggml.c @@ -3879,14 +3879,12 @@ struct ggml_context_container { #define GGML_NUMA_MAX_NODES 8 #define GGML_NUMA_MAX_CPUS 512 -struct ggml_numa_node -{ +struct ggml_numa_node { uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node uint32_t n_cpus; }; -struct ggml_numa_nodes -{ +struct ggml_numa_nodes { struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES]; uint32_t n_nodes; uint32_t total_cpus; // hardware threads on system @@ -3923,13 +3921,18 @@ inline static void ggml_critical_section_end(void) { atomic_fetch_sub(&g_state_barrier, 1); } -void ggml_numa_init(void) -{ - if (g_state.numa.n_nodes > 0) { return; } +void ggml_numa_init(void) { + if (g_state.numa.n_nodes > 0) { + fprintf(stderr, "ggml_numa_init: NUMA already initialized\n"); + + return; + } + #ifdef __linux__ struct stat st; char path[256]; int rv; + // enumerate nodes while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) { rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes); @@ -3937,6 +3940,7 @@ void ggml_numa_init(void) if (stat(path, &st) != 0) { break; } ++g_state.numa.n_nodes; } + // enumerate CPUs while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) { rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus); @@ -3944,11 +3948,14 @@ void ggml_numa_init(void) if (stat(path, &st) != 0) { break; } ++g_state.numa.total_cpus; } + GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus); + if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) { g_state.numa.n_nodes = 0; return; } + for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) { struct 
ggml_numa_node * node = &g_state.numa.nodes[n]; GGML_PRINT_DEBUG("CPUs on node %u:", n); @@ -3963,6 +3970,7 @@ void ggml_numa_init(void) } GGML_PRINT_DEBUG("\n"); } + if (ggml_is_numa()) { FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); if (fptr != NULL) { @@ -3978,7 +3986,9 @@ void ggml_numa_init(void) #endif } -bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; } +bool ggml_is_numa(void) { + return g_state.numa.n_nodes > 1; +} //////////////////////////////////////////////////////////////////////////////// diff --git a/llama.cpp b/llama.cpp index c41c2a8a32992..e932636fc2a36 100644 --- a/llama.cpp +++ b/llama.cpp @@ -977,7 +977,7 @@ bool llama_mlock_supported() { return llama_mlock::SUPPORTED; } -void llama_init_backend() { +void llama_init_backend(bool numa) { ggml_time_init(); // needed to initialize f16 tables @@ -986,6 +986,10 @@ void llama_init_backend() { struct ggml_context * ctx = ggml_init(params); ggml_free(ctx); } + + if (numa) { + ggml_numa_init(); + } } int64_t llama_time_us() { diff --git a/llama.h b/llama.h index a833a7f4d66cc..76239be25fc22 100644 --- a/llama.h +++ b/llama.h @@ -140,8 +140,9 @@ extern "C" { // TODO: not great API - very likely to change // Initialize the llama + ggml backend + // If numa is true, use NUMA optimizations // Call once at the start of the program - LLAMA_API void llama_init_backend(); + LLAMA_API void llama_init_backend(bool numa); LLAMA_API int64_t llama_time_us(); From 875a1e111eaa9db3fd51be8c3b3288291ec2f1d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:27:24 +0300 Subject: [PATCH 15/18] llama : avoid ggml include in llama-util.h --- llama-util.h | 18 ++++++++++-------- llama.cpp | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/llama-util.h b/llama-util.h index d709319dffd86..042ebe43c48e1 100644 --- a/llama-util.h +++ b/llama-util.h @@ -16,8 +16,6 @@ #include #include -#include "ggml.h" - #ifdef __has_include #if __has_include() #include @@ -174,12 +172,12 @@ struct llama_mmap { #ifdef _POSIX_MAPPED_FILES static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) { + llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) { size = file->size; int fd = fileno(file->fp); int flags = MAP_SHARED; // prefetch/readahead impairs performance on NUMA systems - if (ggml_is_numa()) { prefetch = 0; } + if (numa) { prefetch = 0; } #ifdef __linux__ if (prefetch) { flags |= MAP_POPULATE; } #endif @@ -195,7 +193,7 @@ struct llama_mmap { strerror(errno)); } } - if (ggml_is_numa()) { + if (numa) { // advise the kernel not to use readahead // (because the next page might not belong on the same node) if (madvise(addr, file->size, MADV_RANDOM)) { @@ -211,7 +209,9 @@ struct llama_mmap { #elif defined(_WIN32) static constexpr bool SUPPORTED = true; - llama_mmap(struct llama_file * file, bool prefetch = true) { + llama_mmap(struct llama_file * file, bool prefetch = true, bool numa = false) { + (void) numa; + size = file->size; HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); @@ -256,8 +256,10 @@ struct llama_mmap { #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file *, bool prefetch = true) { - (void)prefetch; + llama_mmap(struct llama_file *, bool prefetch = true, bool numa = false) { + (void) prefetch; + (void) numa; + throw std::runtime_error(std::string("mmap not supported")); } #endif diff --git a/llama.cpp b/llama.cpp index 
e932636fc2a36..1a15844bcc7a4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -774,7 +774,7 @@ struct llama_model_loader { } if (use_mmap) { - mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size)); + mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size, ggml_is_numa())); if (lmlock) { lmlock->init(mapping->addr); } @@ -2903,7 +2903,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const // maybe this should in llama_model_loader if (model_loader->use_mmap) { - model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0)); + model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0, ggml_is_numa())); } } From 4a555b453904e0bb69663609b780f17ab76a947b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:37:55 +0300 Subject: [PATCH 16/18] ggml : style / formatting --- ggml.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/ggml.c b/ggml.c index df8370960be6b..899196c47f997 100644 --- a/ggml.c +++ b/ggml.c @@ -16626,39 +16626,50 @@ typedef pthread_t ggml_thread_t; #endif #ifdef __linux__ -void set_numa_thread_affinity(int thread_n, int n_threads) -{ - if (!ggml_is_numa()) { return; } +void set_numa_thread_affinity(int thread_n, int n_threads) { + if (!ggml_is_numa()) { + return; + } + // run thread on node_num thread_n / (threads per node) - int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); + const int node_num = thread_n / ((n_threads + g_state.numa.n_nodes - 1) / g_state.numa.n_nodes); struct ggml_numa_node * node = &g_state.numa.nodes[node_num]; size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (size_t i = 0; i < node->n_cpus; ++i) { CPU_SET_S(node->cpus[i], setsize, cpus); } + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } + CPU_FREE(cpus); } -void clear_numa_thread_affinity(void) -{ - if (!ggml_is_numa()) { return; } + +void clear_numa_thread_affinity(void) { + if (!ggml_is_numa()) { + return; + } + size_t setsize = CPU_ALLOC_SIZE(g_state.numa.total_cpus); + cpu_set_t * cpus = CPU_ALLOC(g_state.numa.total_cpus); CPU_ZERO_S(setsize, cpus); for (unsigned i = 0; i < g_state.numa.total_cpus; ++i) { CPU_SET_S(i, setsize, cpus); } + int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); if (rv) { fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", strerror(rv)); } + CPU_FREE(cpus); } #else @@ -16699,10 +16710,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static thread_ret_t ggml_graph_compute_thread(void * data) { struct ggml_compute_state * state = (struct ggml_compute_state *) data; struct ggml_cgraph * cgraph = state->shared->cgraph; + const int n_threads = state->shared->n_threads; set_numa_thread_affinity(state->ith, n_threads); int node_n = -1; + while (true) { if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { // all other threads are finished and spinning @@ -17165,6 +17178,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // this is a work thread too ggml_graph_compute_thread(&workers[0]); + // don't leave affinity set on the main thread clear_numa_thread_affinity(); From 81a40e9d6176a1c40202e3705e3e1f14248ca4b2 Mon Sep 17 00:00:00 2001 
From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:50:50 +0300 Subject: [PATCH 17/18] ggml : fix handling of ops with n_threads > n_tasks > 1 --- ggml.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/ggml.c b/ggml.c index 899196c47f997..4d51e31ed45c1 100644 --- a/ggml.c +++ b/ggml.c @@ -16765,7 +16765,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } atomic_store(&state->shared->n_active, n_threads); - atomic_store(&state->shared->node_n, node_n); + atomic_store(&state->shared->node_n, node_n); } else { // wait for other threads to finish const int last = node_n; @@ -16774,11 +16774,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { node_n = atomic_load(&state->shared->node_n); } while (node_n == last); } + // check if we should stop if (node_n >= cgraph->n_nodes) break; /* COMPUTE */ struct ggml_tensor * node = cgraph->nodes[node_n]; + struct ggml_compute_params params = { /*.type =*/ GGML_TASK_COMPUTE, /*.ith =*/ state->ith, @@ -16787,10 +16789,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL, }; - if(state->ith < node->n_tasks) { + if (state->ith < node->n_tasks) { ggml_compute_forward(¶ms, node); - } else { - break; } } @@ -16952,7 +16952,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->n_tasks = n_threads; + node->n_tasks = 1; } break; case GGML_OP_SET: case GGML_OP_CONT: @@ -17165,9 +17165,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) .shared = &state_shared, }; - int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); GGML_ASSERT(rc == 0); - UNUSED(rc); } } workers[0].ith = 0; @@ -17185,9 +17184,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) // join thread pool if (n_threads > 1) { for (int j = 1; j < n_threads; j++) { - int rc = ggml_thread_join(workers[j].thrd, NULL); + const int rc = ggml_thread_join(workers[j].thrd, NULL); GGML_ASSERT(rc == 0); - UNUSED(rc); } } From 9aec2b74bdfd553669a1c405a716339d605e5798 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 26 Jun 2023 20:53:55 +0300 Subject: [PATCH 18/18] server : utilize numa parameter --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 79df5e84762cd..998d55eacff79 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -789,7 +789,7 @@ int main(int argc, char ** argv) { params.model_alias = params.model; } - llama_init_backend(); + llama_init_backend(params.numa); LOG_INFO("build info", { { "build", BUILD_NUMBER },