Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numa #1556

Merged
merged 20 commits into from
Jun 26, 2023
Merged

Numa #1556

Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 164 additions & 27 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ static int sched_yield (void) {
#include <stdatomic.h>

typedef void* thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
Expand Down Expand Up @@ -103,6 +108,30 @@ typedef void* thread_ret_t;
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2

//
// logging
//

// Debug printing, gated on the compile-time GGML_DEBUG verbosity level.
// Higher numeric suffix = more verbose; disabled variants expand to nothing.
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

// medium-verbosity debug output (GGML_DEBUG >= 5)
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

// high-verbosity debug output (GGML_DEBUG >= 10)
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

// unconditional output (warnings, user-facing messages)
#define GGML_PRINT(...) printf(__VA_ARGS__)

#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
Expand Down Expand Up @@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
}
}


//
// timing
//
Expand Down Expand Up @@ -452,6 +480,89 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif

//
// NUMA support
//

// One NUMA node: the set of hardware threads (logical CPU ids) that belong to it.
struct ggml_numa_node
{
uint32_t *cpus; // hardware threads on this node; capacity is total_cpus, n_cpus entries valid
uint32_t n_cpus;
};

// System NUMA topology as discovered by ggml_numa_init() (sysfs enumeration on Linux).
struct ggml_numa_nodes
{
struct ggml_numa_node *nodes; // array of n_nodes entries; NULL until init succeeds
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};

// Global NUMA topology; zero-initialized means "not initialized / non-NUMA".
// Populated once by ggml_numa_init(); read by ggml_is_numa() and the
// thread-affinity helpers.
// TODO(review): fold this into g_state instead of a standalone global.
struct ggml_numa_nodes ggml_numa = {
.nodes = NULL,
.n_nodes = 0,
.total_cpus = 0,
};
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has to become part of g_state


// Discover the system's NUMA topology by probing Linux sysfs paths.
// Idempotent: returns immediately if already initialized. On non-Linux
// platforms this is currently a no-op (n_nodes stays 0, so ggml_is_numa()
// reports false and all NUMA-specific behavior is disabled).
void ggml_numa_init(void)
{
// already initialized (or system confirmed non-NUMA on a prior call with n_nodes reset)
if (ggml_numa.n_nodes > 0) { return; }
#ifdef __linux__
struct stat st;
char path[256];
int rv;
// enumerate nodes
// /sys/devices/system/node/node<N> exists for each online NUMA node
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++ggml_numa.n_nodes;
}
// enumerate CPUs
// /sys/devices/system/cpu/cpu<N> exists for each logical CPU
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++ggml_numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus);
// bail out before calloc if enumeration found nothing (also silences the
// clang-analyzer "allocation size of 0 bytes" warning path)
if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) {
ggml_numa.n_nodes = 0;
return;
}
// NOTE(review): this allocation is never freed — intentional if the topology
// lives for the whole program, but see review suggestion to use static arrays
ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: Call to 'calloc' has an allocation size of 0 bytes [clang-analyzer-optin.portability.UnixAPI]

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^
Additional context

ggml.c:507: Assuming field 'n_nodes' is <= 0

    if (ggml_numa.n_nodes > 0) return;
        ^

ggml.c:507: Taking false branch

    if (ggml_numa.n_nodes > 0) return;
    ^

ggml.c:513: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:515: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:515: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:515: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:516: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:516: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:516: Execution continues on line 521

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:520: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:522: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:522: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:522: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:523: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:523: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:523: Execution continues on line 528

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:527: Call to 'calloc' has an allocation size of 0 bytes

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid allocs or figure out a way to release the memory at the end of the program.
Alternatively, use static size arrays with max allowed size as constants

GGML_ASSERT(ggml_numa.nodes != NULL);
// populate each node's CPU list by probing node<N>/cpu<C> membership entries
for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) {
struct ggml_numa_node *node = &ggml_numa.nodes[n];
// sized for the worst case (all CPUs on one node); only n_cpus entries used
node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t));
GGML_ASSERT(node->cpus != NULL);
GGML_PRINT_DEBUG("CPUs on node %u:", n);
for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
// presence of the symlink means CPU c belongs to node n
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}
// warn if automatic kernel NUMA balancing is on: page migration has been
// observed to hurt inference performance on multi-node systems
if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
// TODO
#endif
}

// Report whether init detected a multi-node (NUMA) topology.
// Always false if ggml_numa_init() was never called.
bool ggml_is_numa(void) {
    return ggml_numa.n_nodes > 1;
}

//
// cache line
//
Expand Down Expand Up @@ -3405,30 +3516,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
*s = 1.f/(*s);
}

//
// logging
//

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

//
// data types
//
Expand Down Expand Up @@ -3615,6 +3702,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"f(x,y)",
};

// only send finalize op to thread pool if it actually does something
// currently none of them?
// All-zero initializer: every op's entry is false, so the FINALIZE
// synchronization round in ggml_graph_compute is skipped for all ops.
static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = {
0
};

static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");

static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
Expand Down Expand Up @@ -13966,6 +14059,49 @@ typedef pthread_t ggml_thread_t;

#endif

#ifdef __linux__
// Pin the calling worker thread to the CPUs of a single NUMA node.
// Threads are assigned to nodes in contiguous groups of n_threads / n_nodes.
// No-op on systems with fewer than two NUMA nodes (or if init was skipped).
//
// thread_n  - index of this thread in [0, n_threads)
// n_threads - total number of compute threads
void set_numa_thread_affinity(int thread_n, int n_threads)
{
    if (!ggml_is_numa()) { return; }

    // run thread on node_num = thread_n / (threads per node);
    // guard against division by zero when n_threads < n_nodes, and clamp the
    // result so we never index past ggml_numa.nodes[]
    uint32_t node_num;
    uint32_t threads_per_node = (uint32_t)n_threads / ggml_numa.n_nodes;
    if (threads_per_node > 0) {
        node_num = (uint32_t)thread_n / threads_per_node;
    } else {
        node_num = (uint32_t)thread_n; // fewer threads than nodes: one thread per node
    }
    if (node_num >= ggml_numa.n_nodes) {
        node_num = ggml_numa.n_nodes - 1;
    }

    struct ggml_numa_node *node = &ggml_numa.nodes[node_num];
    size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
    cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
    if (cpus == NULL) {
        // affinity is a performance optimization only; keep running unpinned
        fprintf(stderr, "warning: CPU_ALLOC(%u) failed, not setting thread affinity\n",
                ggml_numa.total_cpus);
        return;
    }
    CPU_ZERO_S(setsize, cpus);
    for (size_t i = 0; i < node->n_cpus; ++i) {
        CPU_SET_S(node->cpus[i], setsize, cpus);
    }
    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
    if (rv) {
        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
                strerror(rv));
    }
    CPU_FREE(cpus);
}
void clear_numa_thread_affinity(void)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does not seem to be used

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The workers don't need it because they terminate. In the first attempt I wasn't setting affinity for the main thread, which seems to be fine because then it runs on the last remaining core. When I tried setting it performance was degraded if it stayed set after eval, so I wrote that to clear it when the worker threads were joined. That solved the performance degradation, but setting it and clearing it seemed to perform the same as not setting it to begin with, so then I didn't use it.

{
// nothing to undo if affinity was never restricted
if (!ggml_is_numa()) { return; }
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
// build a mask containing every CPU, i.e. restore the default "run anywhere"
for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
// non-fatal: the thread simply keeps its previous affinity
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
// No-op stubs so callers compile unconditionally on non-Linux platforms.
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
// use (void): empty parens in a C definition mean "unspecified parameters"
// (pre-C23) and would mismatch the Linux definition's prototype
void clear_numa_thread_affinity(void) {}
#endif

struct ggml_compute_state_shared {
ggml_lock_t spin;

Expand All @@ -13990,6 +14126,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;

const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->params.ith, n_threads);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do this only if NUMA ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first line of the function returns if the system doesn't have at least two nodes, in which case it wouldn't have anything to do anyway because it would just be pinning all the threads to all the cores, which is the default.


while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
Expand Down Expand Up @@ -14414,7 +14551,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
}

// FINALIZE
if (node->n_tasks > 1) {
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: Dereference of null pointer [clang-analyzer-core.NullDereference]

+) {
                                       ^
Additional context

ggml.c:14165: Assuming 'n_threads' is <= 1

  };
                                               ^

ggml.c:14165: '?' condition is false

  };
                                               ^

ggml.c:14165: 'workers' initialized to a null pointer value

  };
         ^

ggml.c:14168: 'n_threads' is <= 1

pool
             ^

ggml.c:14168: Taking false branch

pool
         ^

ggml.c:14198: Assuming 'i' is < field 'n_nodes'

ions
                             ^

ggml.c:14198: Loop condition is true. Entering loop body

ions
             ^

ggml.c:14201: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 14432

i];
                 ^

ggml.c:14435: Execution continues on line 14199

= 1;
                           ^

ggml.c:14198: Assuming 'i' is >= field 'n_nodes'

ions
                             ^

ggml.c:14198: Loop condition is false. Execution continues on line 14448

ions
             ^

ggml.c:14447: Assuming field 'work' is equal to NULL

  }
                 ^

ggml.c:14447: Left side of '&&' is false

  }
                                      ^

ggml.c:14451: 'work_size' is <= 0

  }
                 ^

ggml.c:14451: Left side of '&&' is false

  }
                               ^

ggml.c:14462: Loop condition is true. Entering loop body

();
         ^

ggml.c:14480: Field 'work' is null

sks,
                                      ^

ggml.c:14480: '?' condition is false

sks,
                              ^

ggml.c:14481: Field 'work' is null

: 0,
                                      ^

ggml.c:14481: '?' condition is false

: 0,
                              ^

ggml.c:14484: Calling 'ggml_compute_forward'

 };
             ^

ggml.c:12939: 'params' is non-null

r) {
                     ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:12939: Taking false branch

r) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12939: Loop condition is false. Exiting loop

r) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138

s);
         ^

ggml.c:13140: Calling 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877

n) {
         ^

ggml.c:12878: Calling 'ggml_compute_forward_map_unary_f32'

   {
                     ^

ggml.c:12850: Taking false branch

n) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12850: Loop condition is false. Exiting loop

n) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12852: Field 'type' is equal to GGML_TASK_INIT

));
                     ^

ggml.c:12852: Left side of '||' is true

));
                                            ^

ggml.c:12853: Returning without writing to 'dst->op', which participates in a condition later

E) {
             ^

ggml.c:12878: Returning from 'ggml_compute_forward_map_unary_f32'

   {
                     ^

ggml.c:12879: Execution continues on line 12879

un);
                   ^

ggml.c:12887: Returning without writing to 'dst->op', which participates in a condition later

}
  ^

ggml.c:13140: Returning from 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:13142: Execution continues on line 13141

   }
                 ^

ggml.c:13160: Returning without writing to 'tensor->op', which participates in a condition later

}
  ^

ggml.c:14484: Returning from 'ggml_compute_forward'

 };
             ^

ggml.c:14487: Field 'n_tasks' is <= 1

PUTE
                       ^

ggml.c:14487: Taking false branch

PUTE
             ^

ggml.c:14520: Calling 'ggml_compute_forward'

UTE;
             ^

ggml.c:12939: 'params' is non-null

r) {
                     ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:12939: Taking false branch

r) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12939: Loop condition is false. Exiting loop

r) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138

s);
         ^

ggml.c:13140: Calling 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877

n) {
         ^

ggml.c:12878: Value assigned to field 'op', which participates in a condition later

   {
                     ^

ggml.c:12879: Execution continues on line 12879

un);
                   ^

ggml.c:13140: Returning from 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:13142: Execution continues on line 13141

   }
                 ^

ggml.c:14520: Returning from 'ggml_compute_forward'

UTE;
             ^

ggml.c:14523: Assuming field 'n_tasks' is > 1

pool
                 ^

ggml.c:14523: Taking true branch

pool
             ^

ggml.c:14524: Assuming the condition is false

1) {
                     ^

/usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add'

#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
                                          ^

ggml.c:14524: Taking false branch

1) {
                 ^

ggml.c:14528: Loop condition is false. Execution continues on line 14534

  }
                 ^

ggml.c:14535: Loop condition is false. Execution continues on line 14543

1);
                 ^

ggml.c:14542: Field 'n_tasks' is > 1

LIZE
                       ^

ggml.c:14542: Left side of '&&' is true

LIZE
                 ^

ggml.c:14542: Assuming the condition is true

LIZE
                                      ^

ggml.c:14542: Taking true branch

LIZE
             ^

ggml.c:14543: Assuming the condition is false

]) {
                     ^

/usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add'

#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
                                          ^

ggml.c:14543: Taking false branch

]) {
                 ^

ggml.c:14547: Loop condition is false. Execution continues on line 14554

  }
                 ^

ggml.c:14553: Assuming the condition is true

pool
                                 ^

ggml.c:14553: Loop condition is true. Entering loop body

pool
                 ^

ggml.c:14558: Field 'work' is null

sks,
                                          ^

ggml.c:14558: '?' condition is false

sks,
                                  ^

ggml.c:14559: Field 'work' is null

: 0,
                                          ^

ggml.c:14559: '?' condition is false

: 0,
                                  ^

ggml.c:14554: Dereference of null pointer

+) {
                                       ^

atomic_store(&state_shared.has_work, false);
}
Expand Down Expand Up @@ -14450,7 +14587,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
ggml_compute_forward(&params, node);

// wait for thread pool
if (node->n_tasks > 1) {
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good change

if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
Expand Down
3 changes: 3 additions & 0 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,9 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);

GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);

Expand Down
15 changes: 14 additions & 1 deletion llama-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ static std::string llama_format_win_err(DWORD err) {
}
#endif

// NOTE(review): ad-hoc forward declaration of a ggml symbol bypasses ggml.h;
// prefer including the header (or gating on a GGML_USE_NUMA define) so the
// signature cannot silently drift — TODO confirm with maintainers
extern "C" {
bool ggml_is_numa();
}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has to be avoided. Probably utilize new GGML_USE_NUMA define

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function is still needed, because even if NUMA support is compiled in, there are some operations that only make sense if the system actually has more than one NUMA node. It also makes it easier to disable those operations, because that function always returns false if ggml_numa_init() is never called, so we can add a --numa option and not call ggml_numa_init() otherwise, causing those operations to be disabled. The latest commits do this.

struct llama_mmap {
void * addr;
size_t size;
Expand All @@ -176,8 +179,10 @@ struct llama_mmap {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (ggml_is_numa()) { prefetch = 0; }
#ifdef __linux__
flags |= MAP_POPULATE;
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
Expand All @@ -191,6 +196,14 @@ struct llama_mmap {
strerror(errno));
}
}
if (ggml_is_numa()) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}

~llama_mmap() {
Expand Down
1 change: 1 addition & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,7 @@ bool llama_mlock_supported() {

void llama_init_backend() {
ggml_time_init();
ggml_numa_init();

// needed to initialize f16 tables
{
Expand Down