Numa #1556

Merged 20 commits on Jun 26, 2023
Changes from 1 commit
llama : allow to initialize backend with NUMA support
ggerganov committed Jun 26, 2023
commit 0fe4b00de249194c134b72fd7a89c0550c4e84b7
2 changes: 1 addition & 1 deletion examples/embedding/embedding.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
6 changes: 1 addition & 5 deletions examples/main/main.cpp
@@ -5,7 +5,6 @@
 
 #include "common.h"
 #include "llama.h"
-#include "ggml.h"
 #include "build-info.h"
 
 #include <cassert>
@@ -106,10 +105,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
-    if (params.numa) {
-        ggml_numa_init();
-    }
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
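With this commit the conditional ggml_numa_init() call in main.cpp moves behind llama_init_backend(), so the examples no longer need to include ggml.h. A minimal caller after this change might look like the sketch below; it is illustrative only — the flag parsing here is made up, while the real examples read params.numa from gpt_params via --numa.

#include "llama.h"

#include <string>

int main(int argc, char ** argv) {
    // Hypothetical flag parsing; the real examples populate gpt_params::numa.
    const bool use_numa = argc > 1 && std::string(argv[1]) == "--numa";

    // One call now covers the timing / F16 table setup and, when requested,
    // the ggml NUMA initialization that used to be a separate call.
    llama_init_backend(use_numa);

    // ... load a model and run inference as the examples do ...

    return 0;
}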
2 changes: 1 addition & 1 deletion examples/perplexity/perplexity.cpp
@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
2 changes: 1 addition & 1 deletion examples/quantize/quantize.cpp
@@ -180,7 +180,7 @@ int main(int argc, char ** argv) {
         usage(argv[0]);
     }
 
-    llama_init_backend();
+    llama_init_backend(false);
 
     // parse command line arguments
    const std::string fname_inp = argv[arg_idx];
2 changes: 1 addition & 1 deletion examples/simple/simple.cpp
@@ -66,7 +66,7 @@ int main(int argc, char ** argv)
     // Init LLM :
     //---------------------------------
 
-    llama_init_backend();
+    llama_init_backend(params.numa);
 
     llama_model * model;
     llama_context * ctx;
26 changes: 18 additions & 8 deletions ggml.c
@@ -3879,14 +3879,12 @@ struct ggml_context_container {
 #define GGML_NUMA_MAX_NODES 8
 #define GGML_NUMA_MAX_CPUS 512
 
-struct ggml_numa_node
-{
+struct ggml_numa_node {
     uint32_t cpus[GGML_NUMA_MAX_CPUS]; // hardware threads on this node
     uint32_t n_cpus;
 };
 
-struct ggml_numa_nodes
-{
+struct ggml_numa_nodes {
     struct ggml_numa_node nodes[GGML_NUMA_MAX_NODES];
     uint32_t n_nodes;
     uint32_t total_cpus; // hardware threads on system
@@ -3923,32 +3921,41 @@ inline static void ggml_critical_section_end(void) {
     atomic_fetch_sub(&g_state_barrier, 1);
 }
 
-void ggml_numa_init(void)
-{
-    if (g_state.numa.n_nodes > 0) { return; }
+void ggml_numa_init(void) {
+    if (g_state.numa.n_nodes > 0) {
+        fprintf(stderr, "ggml_numa_init: NUMA already initialized\n");
+
+        return;
+    }
+
 #ifdef __linux__
     struct stat st;
     char path[256];
     int rv;
+
     // enumerate nodes
     while (g_state.numa.n_nodes < GGML_NUMA_MAX_NODES) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", g_state.numa.n_nodes);
         GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.n_nodes;
     }
+
     // enumerate CPUs
     while (g_state.numa.total_cpus < GGML_NUMA_MAX_CPUS) {
         rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", g_state.numa.total_cpus);
         GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
         if (stat(path, &st) != 0) { break; }
         ++g_state.numa.total_cpus;
     }
+
     GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", g_state.numa.n_nodes, g_state.numa.total_cpus);
+
     if (g_state.numa.n_nodes < 1 || g_state.numa.total_cpus < 1) {
         g_state.numa.n_nodes = 0;
         return;
     }
+
     for (uint32_t n = 0; n < g_state.numa.n_nodes; ++n) {
         struct ggml_numa_node * node = &g_state.numa.nodes[n];
         GGML_PRINT_DEBUG("CPUs on node %u:", n);
@@ -3963,6 +3970,7 @@ void ggml_numa_init(void)
         }
         GGML_PRINT_DEBUG("\n");
     }
+
     if (ggml_is_numa()) {
         FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
         if (fptr != NULL) {
@@ -3978,7 +3986,9 @@
 #endif
 }
 
-bool ggml_is_numa(void) { return g_state.numa.n_nodes > 1; }
+bool ggml_is_numa(void) {
+    return g_state.numa.n_nodes > 1;
+}
 
 ////////////////////////////////////////////////////////////////////////////////
 
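The third hunk above is cut off by the diff view right after the fopen of /proc/sys/kernel/numa_balancing. As far as the visible lines show, the intent is to warn when the kernel's automatic NUMA balancing is enabled, since page migration can work against threads pinned to specific nodes. The following is a standalone sketch of that idea, not the PR's exact code; the message text and buffer handling are assumptions.

// Sketch only: warn if automatic NUMA balancing is enabled on Linux.
#include <stdio.h>
#include <string.h>

static void warn_if_numa_balancing(void) {
    FILE * fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
    if (fptr == NULL) {
        return; // file absent (non-Linux or older kernel): nothing to check
    }

    char buf[16];
    // The file contains "0" when balancing is disabled; anything else warns.
    if (fgets(buf, sizeof(buf), fptr) != NULL && strncmp(buf, "0", 1) != 0) {
        fprintf(stderr, "warning: /proc/sys/kernel/numa_balancing is enabled; "
                        "this may hurt NUMA-pinned performance\n");
    }
    fclose(fptr);
}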
6 changes: 5 additions & 1 deletion llama.cpp
@@ -977,7 +977,7 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
-void llama_init_backend() {
+void llama_init_backend(bool numa) {
     ggml_time_init();
 
     // needed to initialize f16 tables
@@ -986,6 +986,10 @@ void llama_init_backend() {
         struct ggml_context * ctx = ggml_init(params);
         ggml_free(ctx);
     }
+
+    if (numa) {
+        ggml_numa_init();
+    }
 }
 
 int64_t llama_time_us() {
3 changes: 2 additions & 1 deletion llama.h
@@ -140,8 +140,9 @@ extern "C" {
 
     // TODO: not great API - very likely to change
     // Initialize the llama + ggml backend
+    // If numa is true, use NUMA optimizations
     // Call once at the start of the program
-    LLAMA_API void llama_init_backend();
+    LLAMA_API void llama_init_backend(bool numa);
 
     LLAMA_API int64_t llama_time_us();
 
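The header comment spells out the contract: pass true to enable the NUMA optimizations, and call the function once at program start. For downstream callers the adaptation is mechanical; the snippet below is illustrative only and not part of the PR.

#include "llama.h"

// Call once, early in main(). Passing false keeps the previous behavior;
// passing true additionally runs ggml_numa_init() inside the backend setup.
void init_backend_for_app(bool want_numa) {
    llama_init_backend(want_numa);
}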