Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numa #1556

Merged
merged 20 commits into from
Jun 26, 2023
Merged

Numa #1556

Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 164 additions & 27 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ static int sched_yield (void) {
#include <stdatomic.h>

typedef void* thread_ret_t;

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>

#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
Expand Down Expand Up @@ -103,6 +108,30 @@ typedef void* thread_ret_t;
#define GGML_SOFT_MAX_UNROLL 4
#define GGML_VEC_DOT_UNROLL 2

//
// logging
//

// Debug printing, gated on the compile-time GGML_DEBUG verbosity level.
// Higher numeric suffix = more verbose; disabled variants expand to nothing.
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

// medium-verbosity debug output (GGML_DEBUG >= 5)
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

// high-verbosity debug output (GGML_DEBUG >= 10)
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

// unconditional output (warnings, user-facing messages)
#define GGML_PRINT(...) printf(__VA_ARGS__)

#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
// note: not sure if it is actually faster
Expand Down Expand Up @@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
}
}


//
// timing
//
Expand Down Expand Up @@ -452,6 +480,89 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif

//
// NUMA support
//

// One NUMA node: the set of hardware threads (logical CPU ids) that belong to it.
struct ggml_numa_node
{
uint32_t *cpus; // hardware threads on this node; capacity is total_cpus, n_cpus entries valid
uint32_t n_cpus;
};

// System NUMA topology as discovered by ggml_numa_init() (sysfs enumeration on Linux).
struct ggml_numa_nodes
{
struct ggml_numa_node *nodes; // array of n_nodes entries; NULL until init succeeds
uint32_t n_nodes;
uint32_t total_cpus; // hardware threads on system
};

// Global NUMA topology; zero-initialized means "not initialized / non-NUMA".
// Populated once by ggml_numa_init(); read by ggml_is_numa() and the
// thread-affinity helpers.
// TODO(review): fold this into g_state instead of a standalone global.
struct ggml_numa_nodes ggml_numa = {
.nodes = NULL,
.n_nodes = 0,
.total_cpus = 0,
};
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has to become part of g_state


// Discover the system's NUMA topology by probing Linux sysfs paths.
// Idempotent: returns immediately if already initialized. On non-Linux
// platforms this is currently a no-op (n_nodes stays 0, so ggml_is_numa()
// reports false and all NUMA-specific behavior is disabled).
void ggml_numa_init(void)
{
// already initialized (or system confirmed non-NUMA on a prior call with n_nodes reset)
if (ggml_numa.n_nodes > 0) { return; }
#ifdef __linux__
struct stat st;
char path[256];
int rv;
// enumerate nodes
// /sys/devices/system/node/node<N> exists for each online NUMA node
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++ggml_numa.n_nodes;
}
// enumerate CPUs
// /sys/devices/system/cpu/cpu<N> exists for each logical CPU
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) { break; }
++ggml_numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus);
// bail out before calloc if enumeration found nothing (also silences the
// clang-analyzer "allocation size of 0 bytes" warning path)
if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) {
ggml_numa.n_nodes = 0;
return;
}
// NOTE(review): this allocation is never freed — intentional if the topology
// lives for the whole program, but see review suggestion to use static arrays
ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: Call to 'calloc' has an allocation size of 0 bytes [clang-analyzer-optin.portability.UnixAPI]

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^
Additional context

ggml.c:507: Assuming field 'n_nodes' is <= 0

    if (ggml_numa.n_nodes > 0) return;
        ^

ggml.c:507: Taking false branch

    if (ggml_numa.n_nodes > 0) return;
    ^

ggml.c:513: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:515: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:515: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:515: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:516: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:516: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:516: Execution continues on line 521

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:520: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:522: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:522: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:522: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:523: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:523: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:523: Execution continues on line 528

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:527: Call to 'calloc' has an allocation size of 0 bytes

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid allocs or figure out a way to release the memory at the end of the program.
Alternatively, use static size arrays with max allowed size as constants

GGML_ASSERT(ggml_numa.nodes != NULL);
// populate each node's CPU list by probing node<N>/cpu<C> membership entries
for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) {
struct ggml_numa_node *node = &ggml_numa.nodes[n];
// sized for the worst case (all CPUs on one node); only n_cpus entries used
node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t));
GGML_ASSERT(node->cpus != NULL);
GGML_PRINT_DEBUG("CPUs on node %u:", n);
for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
// presence of the symlink means CPU c belongs to node n
if (stat(path, &st) == 0) {
node->cpus[node->n_cpus++] = c;
GGML_PRINT_DEBUG(" %u", c);
}
}
GGML_PRINT_DEBUG("\n");
}
// warn if automatic kernel NUMA balancing is on: page migration has been
// observed to hurt inference performance on multi-node systems
if (ggml_is_numa()) {
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r");
if (fptr != NULL) {
char buf[42];
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) {
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n");
}
fclose(fptr);
}
}
#else
// TODO
#endif
}

// Report whether init detected a multi-node (NUMA) topology.
// Always false if ggml_numa_init() was never called.
bool ggml_is_numa(void) {
    return ggml_numa.n_nodes > 1;
}

//
// cache line
//
Expand Down Expand Up @@ -3405,30 +3516,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
*s = 1.f/(*s);
}

//
// logging
//

#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif

#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif

#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif

#define GGML_PRINT(...) printf(__VA_ARGS__)

//
// data types
//
Expand Down Expand Up @@ -3615,6 +3702,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"f(x,y)",
};

// only send finalize op to thread pool if it actually does something
// currently none of them?
// All-zero initializer: every op's entry is false, so the FINALIZE
// synchronization round in ggml_graph_compute is skipped for all ops.
static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = {
0
};

static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51");

static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
Expand Down Expand Up @@ -13966,6 +14059,49 @@ typedef pthread_t ggml_thread_t;

#endif

#ifdef __linux__
// Pin the calling worker thread to the CPUs of a single NUMA node.
// Threads are assigned to nodes in contiguous groups of n_threads / n_nodes.
// No-op on systems with fewer than two NUMA nodes (or if init was skipped).
//
// thread_n  - index of this thread in [0, n_threads)
// n_threads - total number of compute threads
void set_numa_thread_affinity(int thread_n, int n_threads)
{
    if (!ggml_is_numa()) { return; }

    // run thread on node_num = thread_n / (threads per node);
    // guard against division by zero when n_threads < n_nodes, and clamp the
    // result so we never index past ggml_numa.nodes[]
    uint32_t node_num;
    uint32_t threads_per_node = (uint32_t)n_threads / ggml_numa.n_nodes;
    if (threads_per_node > 0) {
        node_num = (uint32_t)thread_n / threads_per_node;
    } else {
        node_num = (uint32_t)thread_n; // fewer threads than nodes: one thread per node
    }
    if (node_num >= ggml_numa.n_nodes) {
        node_num = ggml_numa.n_nodes - 1;
    }

    struct ggml_numa_node *node = &ggml_numa.nodes[node_num];
    size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
    cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
    if (cpus == NULL) {
        // affinity is a performance optimization only; keep running unpinned
        fprintf(stderr, "warning: CPU_ALLOC(%u) failed, not setting thread affinity\n",
                ggml_numa.total_cpus);
        return;
    }
    CPU_ZERO_S(setsize, cpus);
    for (size_t i = 0; i < node->n_cpus; ++i) {
        CPU_SET_S(node->cpus[i], setsize, cpus);
    }
    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
    if (rv) {
        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
                strerror(rv));
    }
    CPU_FREE(cpus);
}
void clear_numa_thread_affinity(void)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does not seem to be used

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The workers don't need it because they terminate. In the first attempt I wasn't setting affinity for the main thread, which seems to be fine because then it runs on the last remaining core. When I tried setting it performance was degraded if it stayed set after eval, so I wrote that to clear it when the worker threads were joined. That solved the performance degradation, but setting it and clearing it seemed to perform the same as not setting it to begin with, so then I didn't use it.

{
// nothing to undo if affinity was never restricted
if (!ggml_is_numa()) { return; }
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
// build a mask containing every CPU, i.e. restore the default "run anywhere"
for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
// non-fatal: the thread simply keeps its previous affinity
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
CPU_FREE(cpus);
}
#else
// TODO: Windows etc.
// (the linux implementation may also work on BSD, someone should test)
// No-op stubs so callers compile unconditionally on non-Linux platforms.
void set_numa_thread_affinity(int thread_n, int n_threads) { UNUSED(thread_n); UNUSED(n_threads); }
// use (void): empty parens in a C definition mean "unspecified parameters"
// (pre-C23) and would mismatch the Linux definition's prototype
void clear_numa_thread_affinity(void) {}
#endif

struct ggml_compute_state_shared {
ggml_lock_t spin;

Expand All @@ -13990,6 +14126,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;

const int n_threads = state->shared->n_threads;
set_numa_thread_affinity(state->params.ith, n_threads);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do this only if NUMA ?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first line of the function returns if the system doesn't have at least two nodes, in which case it wouldn't have anything to do anyway because it would just be pinning all the threads to all the cores, which is the default.


while (true) {
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {
Expand Down Expand Up @@ -14414,7 +14551,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
}

// FINALIZE
if (node->n_tasks > 1) {
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) {
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: Dereference of null pointer [clang-analyzer-core.NullDereference]

+) {
                                       ^
Additional context

ggml.c:14165: Assuming 'n_threads' is <= 1

  };
                                               ^

ggml.c:14165: '?' condition is false

  };
                                               ^

ggml.c:14165: 'workers' initialized to a null pointer value

  };
         ^

ggml.c:14168: 'n_threads' is <= 1

pool
             ^

ggml.c:14168: Taking false branch

pool
         ^

ggml.c:14198: Assuming 'i' is < field 'n_nodes'

ions
                             ^

ggml.c:14198: Loop condition is true. Entering loop body

ions
             ^

ggml.c:14201: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 14432

i];
                 ^

ggml.c:14435: Execution continues on line 14199

= 1;
                           ^

ggml.c:14198: Assuming 'i' is >= field 'n_nodes'

ions
                             ^

ggml.c:14198: Loop condition is false. Execution continues on line 14448

ions
             ^

ggml.c:14447: Assuming field 'work' is equal to NULL

  }
                 ^

ggml.c:14447: Left side of '&&' is false

  }
                                      ^

ggml.c:14451: 'work_size' is <= 0

  }
                 ^

ggml.c:14451: Left side of '&&' is false

  }
                               ^

ggml.c:14462: Loop condition is true. Entering loop body

();
         ^

ggml.c:14480: Field 'work' is null

sks,
                                      ^

ggml.c:14480: '?' condition is false

sks,
                              ^

ggml.c:14481: Field 'work' is null

: 0,
                                      ^

ggml.c:14481: '?' condition is false

: 0,
                              ^

ggml.c:14484: Calling 'ggml_compute_forward'

 };
             ^

ggml.c:12939: 'params' is non-null

r) {
                     ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:12939: Taking false branch

r) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12939: Loop condition is false. Exiting loop

r) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138

s);
         ^

ggml.c:13140: Calling 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877

n) {
         ^

ggml.c:12878: Calling 'ggml_compute_forward_map_unary_f32'

   {
                     ^

ggml.c:12850: Taking false branch

n) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12850: Loop condition is false. Exiting loop

n) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12852: Field 'type' is equal to GGML_TASK_INIT

));
                     ^

ggml.c:12852: Left side of '||' is true

));
                                            ^

ggml.c:12853: Returning without writing to 'dst->op', which participates in a condition later

E) {
             ^

ggml.c:12878: Returning from 'ggml_compute_forward_map_unary_f32'

   {
                     ^

ggml.c:12879: Execution continues on line 12879

un);
                   ^

ggml.c:12887: Returning without writing to 'dst->op', which participates in a condition later

}
  ^

ggml.c:13140: Returning from 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:13142: Execution continues on line 13141

   }
                 ^

ggml.c:13160: Returning without writing to 'tensor->op', which participates in a condition later

}
  ^

ggml.c:14484: Returning from 'ggml_compute_forward'

 };
             ^

ggml.c:14487: Field 'n_tasks' is <= 1

PUTE
                       ^

ggml.c:14487: Taking false branch

PUTE
             ^

ggml.c:14520: Calling 'ggml_compute_forward'

UTE;
             ^

ggml.c:12939: 'params' is non-null

r) {
                     ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:12939: Taking false branch

r) {
         ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:12939: Loop condition is false. Exiting loop

r) {
         ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138

s);
         ^

ggml.c:13140: Calling 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877

n) {
         ^

ggml.c:12878: Value assigned to field 'op', which participates in a condition later

   {
                     ^

ggml.c:12879: Execution continues on line 12879

un);
                   ^

ggml.c:13140: Returning from 'ggml_compute_forward_map_unary'

ta);
                     ^

ggml.c:13142: Execution continues on line 13141

   }
                 ^

ggml.c:14520: Returning from 'ggml_compute_forward'

UTE;
             ^

ggml.c:14523: Assuming field 'n_tasks' is > 1

pool
                 ^

ggml.c:14523: Taking true branch

pool
             ^

ggml.c:14524: Assuming the condition is false

1) {
                     ^

/usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add'

#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
                                          ^

ggml.c:14524: Taking false branch

1) {
                 ^

ggml.c:14528: Loop condition is false. Execution continues on line 14534

  }
                 ^

ggml.c:14535: Loop condition is false. Execution continues on line 14543

1);
                 ^

ggml.c:14542: Field 'n_tasks' is > 1

LIZE
                       ^

ggml.c:14542: Left side of '&&' is true

LIZE
                 ^

ggml.c:14542: Assuming the condition is true

LIZE
                                      ^

ggml.c:14542: Taking true branch

LIZE
             ^

ggml.c:14543: Assuming the condition is false

]) {
                     ^

/usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add'

#define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
                                          ^

ggml.c:14543: Taking false branch

]) {
                 ^

ggml.c:14547: Loop condition is false. Execution continues on line 14554

  }
                 ^

ggml.c:14553: Assuming the condition is true

pool
                                 ^

ggml.c:14553: Loop condition is true. Entering loop body

pool
                 ^

ggml.c:14558: Field 'work' is null

sks,
                                          ^

ggml.c:14558: '?' condition is false

sks,
                                  ^

ggml.c:14559: Field 'work' is null

: 0,
                                          ^

ggml.c:14559: '?' condition is false

: 0,
                                  ^

ggml.c:14554: Dereference of null pointer

+) {
                                       ^

atomic_store(&state_shared.has_work, false);
}
Expand Down Expand Up @@ -14450,7 +14587,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
ggml_compute_forward(&params, node);

// wait for thread pool
if (node->n_tasks > 1) {
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good change

if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {
atomic_store(&state_shared.has_work, false);
}
Expand Down
3 changes: 3 additions & 0 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,9 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);

GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);

Expand Down
15 changes: 14 additions & 1 deletion llama-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,9 @@ static std::string llama_format_win_err(DWORD err) {
}
#endif

// NOTE(review): ad-hoc forward declaration of a ggml symbol bypasses ggml.h;
// prefer including the header (or gating on a GGML_USE_NUMA define) so the
// signature cannot silently drift — TODO confirm with maintainers
extern "C" {
bool ggml_is_numa();
}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has to be avoided. Probably utilize new GGML_USE_NUMA define

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this function is still needed, because even if NUMA support is compiled in, there are some operations that only make sense if the system actually has more than one NUMA node. It also makes it easier to disable those operations, because that function always returns false if ggml_numa_init() is never called, so we can add a --numa option and not call ggml_numa_init() otherwise, causing those operations to be disabled. The latest commits do this.

struct llama_mmap {
void * addr;
size_t size;
Expand All @@ -176,8 +179,10 @@ struct llama_mmap {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (ggml_is_numa()) { prefetch = 0; }
#ifdef __linux__
flags |= MAP_POPULATE;
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
Expand All @@ -191,6 +196,14 @@ struct llama_mmap {
strerror(errno));
}
}
if (ggml_is_numa()) {
// advise the kernel not to use readahead
// (because the next page might not belong on the same node)
if (madvise(addr, file->size, MADV_RANDOM)) {
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
strerror(errno));
}
}
}

~llama_mmap() {
Expand Down
1 change: 1 addition & 0 deletions llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -851,6 +851,7 @@ bool llama_mlock_supported() {

void llama_init_backend() {
ggml_time_init();
ggml_numa_init();

// needed to initialize f16 tables
{
Expand Down