-
Notifications
You must be signed in to change notification settings - Fork 9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Numa #1556
Numa #1556
Changes from 5 commits
6fc5f17
0d23f8c
9d058c2
2c1b5ae
8502d51
bf83dcb
b71dfe6
adaad10
c31d51d
2f5bb46
4b94582
90a0e65
d0e3596
67ba34e
8f98035
0fe4b00
875a1e1
4a555b4
81a40e9
9aec2b7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -76,6 +76,11 @@ static int sched_yield (void) { | |
#include <stdatomic.h> | ||
|
||
typedef void* thread_ret_t; | ||
|
||
#include <sys/types.h> | ||
#include <sys/stat.h> | ||
#include <unistd.h> | ||
|
||
#endif | ||
|
||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 | ||
|
@@ -103,6 +108,30 @@ typedef void* thread_ret_t; | |
#define GGML_SOFT_MAX_UNROLL 4 | ||
#define GGML_VEC_DOT_UNROLL 2 | ||
|
||
// | ||
// logging | ||
// | ||
|
||
#if (GGML_DEBUG >= 1) | ||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG(...) | ||
#endif | ||
|
||
#if (GGML_DEBUG >= 5) | ||
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG_5(...) | ||
#endif | ||
|
||
#if (GGML_DEBUG >= 10) | ||
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG_10(...) | ||
#endif | ||
|
||
#define GGML_PRINT(...) printf(__VA_ARGS__) | ||
|
||
#ifdef GGML_USE_ACCELERATE | ||
// uncomment to use vDSP for soft max computation | ||
// note: not sure if it is actually faster | ||
|
@@ -395,7 +424,6 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { | |
} | ||
} | ||
|
||
|
||
// | ||
// timing | ||
// | ||
|
@@ -452,6 +480,89 @@ int64_t ggml_cycles_per_ms(void) { | |
#define ggml_perf_cycles_per_ms() 0 | ||
#endif | ||
|
||
// | ||
// NUMA support | ||
// | ||
|
||
// CPUs (hardware threads) that belong to a single NUMA node.
struct ggml_numa_node {
    uint32_t * cpus;   // ids of the hardware threads on this node
    uint32_t   n_cpus; // number of valid entries in cpus
};
|
||
// System-wide NUMA topology: the list of nodes plus the total hardware-thread count.
struct ggml_numa_nodes {
    struct ggml_numa_node * nodes; // array with n_nodes entries
    uint32_t n_nodes;              // number of NUMA nodes
    uint32_t total_cpus;           // hardware threads on system
};
|
||
struct ggml_numa_nodes ggml_numa = { | ||
.nodes = NULL, | ||
.n_nodes = 0, | ||
.total_cpus = 0, | ||
}; | ||
|
||
void ggml_numa_init(void) | ||
{ | ||
if (ggml_numa.n_nodes > 0) { return; } | ||
#ifdef __linux__ | ||
struct stat st; | ||
char path[256]; | ||
int rv; | ||
// enumerate nodes | ||
while (true) { | ||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes); | ||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); | ||
if (stat(path, &st) != 0) { break; } | ||
++ggml_numa.n_nodes; | ||
} | ||
// enumerate CPUs | ||
while (true) { | ||
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus); | ||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); | ||
if (stat(path, &st) != 0) { break; } | ||
++ggml_numa.total_cpus; | ||
} | ||
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus); | ||
if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) { | ||
ggml_numa.n_nodes = 0; | ||
return; | ||
} | ||
ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: Call to 'calloc' has an allocation size of 0 bytes [clang-analyzer-optin.portability.UnixAPI] ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
^ Additional contextggml.c:507: Assuming field 'n_nodes' is <= 0 if (ggml_numa.n_nodes > 0) return;
^ ggml.c:507: Taking false branch if (ggml_numa.n_nodes > 0) return;
^ ggml.c:513: Loop condition is true. Entering loop body while (true) {
^ ggml.c:515: Assuming 'rv' is > 0 GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:515: Left side of '&&' is true GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.c:515: Assuming the condition is true GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:515: Taking false branch GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:515: Loop condition is false. Exiting loop GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:203: expanded from macro 'GGML_ASSERT' do { \
^ ggml.c:516: Assuming the condition is true if (stat(path, &st) != 0) break;
^ ggml.c:516: Taking true branch if (stat(path, &st) != 0) break;
^ ggml.c:516: Execution continues on line 521 if (stat(path, &st) != 0) break;
^ ggml.c:520: Loop condition is true. Entering loop body while (true) {
^ ggml.c:522: Assuming 'rv' is > 0 GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:522: Left side of '&&' is true GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.c:522: Assuming the condition is true GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:522: Taking false branch GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:522: Loop condition is false. Exiting loop GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
^ ggml.h:203: expanded from macro 'GGML_ASSERT' do { \
^ ggml.c:523: Assuming the condition is true if (stat(path, &st) != 0) break;
^ ggml.c:523: Taking true branch if (stat(path, &st) != 0) break;
^ ggml.c:523: Execution continues on line 528 if (stat(path, &st) != 0) break;
^ ggml.c:527: Call to 'calloc' has an allocation size of 0 bytes ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
^ There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Avoid allocs or figure out a way to release the memory at the end of the program. |
||
GGML_ASSERT(ggml_numa.nodes != NULL); | ||
for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) { | ||
struct ggml_numa_node *node = &ggml_numa.nodes[n]; | ||
node->cpus = calloc(ggml_numa.total_cpus, sizeof(uint32_t)); | ||
GGML_ASSERT(node->cpus != NULL); | ||
GGML_PRINT_DEBUG("CPUs on node %u:", n); | ||
for (uint32_t c = 0; c < ggml_numa.total_cpus; ++c) { | ||
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u/cpu%u", n, c); | ||
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path)); | ||
if (stat(path, &st) == 0) { | ||
node->cpus[node->n_cpus++] = c; | ||
GGML_PRINT_DEBUG(" %u", c); | ||
} | ||
} | ||
GGML_PRINT_DEBUG("\n"); | ||
} | ||
if (ggml_is_numa()) { | ||
FILE *fptr = fopen("/proc/sys/kernel/numa_balancing", "r"); | ||
if (fptr != NULL) { | ||
char buf[42]; | ||
if (fgets(buf, sizeof(buf), fptr) && strncmp(buf, "0\n", sizeof(buf)) != 0) { | ||
GGML_PRINT("WARNING: /proc/sys/kernel/numa_balancing is enabled, this has been observed to impair performance\n"); | ||
} | ||
fclose(fptr); | ||
} | ||
} | ||
#else | ||
// TODO | ||
#endif | ||
} | ||
|
||
// Reports whether more than one NUMA node has been recorded in ggml_numa.
// ggml_numa.n_nodes is initialized to 0, so this is false until ggml_numa_init() has found nodes.
bool ggml_is_numa(void) {
    const uint32_t node_count = ggml_numa.n_nodes;
    return node_count >= 2;
}
|
||
// | ||
// cache line | ||
// | ||
|
@@ -3405,30 +3516,6 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x | |
*s = 1.f/(*s); | ||
} | ||
|
||
// | ||
// logging | ||
// | ||
|
||
#if (GGML_DEBUG >= 1) | ||
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG(...) | ||
#endif | ||
|
||
#if (GGML_DEBUG >= 5) | ||
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG_5(...) | ||
#endif | ||
|
||
#if (GGML_DEBUG >= 10) | ||
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) | ||
#else | ||
#define GGML_PRINT_DEBUG_10(...) | ||
#endif | ||
|
||
#define GGML_PRINT(...) printf(__VA_ARGS__) | ||
|
||
// | ||
// data types | ||
// | ||
|
@@ -3615,6 +3702,12 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { | |
"f(x,y)", | ||
}; | ||
|
||
// Per-op flag: set when the op's FINALIZE phase actually does something, so that
// FINALIZE is only sent to the thread pool for those ops.
// No op sets it at the moment (the single initializer zero-fills the whole table).
static const bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { false };
|
||
static_assert(GGML_OP_COUNT == 51, "GGML_OP_COUNT != 51"); | ||
|
||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); | ||
|
@@ -13966,6 +14059,49 @@ typedef pthread_t ggml_thread_t; | |
|
||
#endif | ||
|
||
#ifdef __linux__ | ||
void set_numa_thread_affinity(int thread_n, int n_threads) | ||
{ | ||
if (!ggml_is_numa()) { return; } | ||
// run thread on node_num thread_n / (threads per node) | ||
int node_num = thread_n / (n_threads / ggml_numa.n_nodes); | ||
struct ggml_numa_node *node = &ggml_numa.nodes[node_num]; | ||
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); | ||
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); | ||
CPU_ZERO_S(setsize, cpus); | ||
for (size_t i = 0; i < node->n_cpus; ++i) { | ||
CPU_SET_S(node->cpus[i], setsize, cpus); | ||
} | ||
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); | ||
if (rv) { | ||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", | ||
strerror(rv)); | ||
} | ||
CPU_FREE(cpus); | ||
} | ||
void clear_numa_thread_affinity(void) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This function does not seem to be used. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The workers don't need it because they terminate. In the first attempt I wasn't setting affinity for the main thread, which seems to be fine because then it runs on the last remaining core. When I tried setting it, performance was degraded if it stayed set after eval, so I wrote that function to clear it when the worker threads were joined. That solved the performance degradation, but setting it and clearing it seemed to perform the same as not setting it to begin with, so I ended up not using it. |
||
{ | ||
if (!ggml_is_numa()) { return; } | ||
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus); | ||
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus); | ||
CPU_ZERO_S(setsize, cpus); | ||
for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) { | ||
CPU_SET_S(i, setsize, cpus); | ||
} | ||
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus); | ||
if (rv) { | ||
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n", | ||
strerror(rv)); | ||
} | ||
CPU_FREE(cpus); | ||
} | ||
#else | ||
// TODO: Windows etc. | ||
// (the linux implementation may also work on BSD, someone should test) | ||
// No-op fallback for platforms without the Linux affinity implementation.
void set_numa_thread_affinity(int thread_n, int n_threads) {
    UNUSED(thread_n);
    UNUSED(n_threads);
}
// No-op fallback for platforms without the Linux affinity implementation.
// Declared with (void) so it is a real prototype, matching the Linux definition.
void clear_numa_thread_affinity(void) {}
#endif | ||
|
||
struct ggml_compute_state_shared { | ||
ggml_lock_t spin; | ||
|
||
|
@@ -13990,6 +14126,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { | |
struct ggml_compute_state * state = (struct ggml_compute_state *) data; | ||
|
||
const int n_threads = state->shared->n_threads; | ||
set_numa_thread_affinity(state->params.ith, n_threads); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we do this only if NUMA? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The first line of the function returns if the system doesn't have at least two nodes, in which case it wouldn't have anything to do anyway because it would just be pinning all the threads to all the cores, which is the default. |
||
|
||
while (true) { | ||
if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) { | ||
|
@@ -14414,7 +14551,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) | |
} | ||
|
||
// FINALIZE | ||
if (node->n_tasks > 1) { | ||
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { | ||
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. warning: Dereference of null pointer [clang-analyzer-core.NullDereference] +) {
^ Additional contextggml.c:14165: Assuming 'n_threads' is <= 1 };
^ ggml.c:14165: '?' condition is false };
^ ggml.c:14165: 'workers' initialized to a null pointer value };
^ ggml.c:14168: 'n_threads' is <= 1 pool
^ ggml.c:14168: Taking false branch pool
^ ggml.c:14198: Assuming 'i' is < field 'n_nodes' ions
^ ggml.c:14198: Loop condition is true. Entering loop body ions
^ ggml.c:14201: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 14432 i];
^ ggml.c:14435: Execution continues on line 14199 = 1;
^ ggml.c:14198: Assuming 'i' is >= field 'n_nodes' ions
^ ggml.c:14198: Loop condition is false. Execution continues on line 14448 ions
^ ggml.c:14447: Assuming field 'work' is equal to NULL }
^ ggml.c:14447: Left side of '&&' is false }
^ ggml.c:14451: 'work_size' is <= 0 }
^ ggml.c:14451: Left side of '&&' is false }
^ ggml.c:14462: Loop condition is true. Entering loop body ();
^ ggml.c:14480: Field 'work' is null sks,
^ ggml.c:14480: '?' condition is false sks,
^ ggml.c:14481: Field 'work' is null : 0,
^ ggml.c:14481: '?' condition is false : 0,
^ ggml.c:14484: Calling 'ggml_compute_forward' };
^ ggml.c:12939: 'params' is non-null r) {
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:12939: Taking false branch r) {
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:12939: Loop condition is false. Exiting loop r) {
^ ggml.h:203: expanded from macro 'GGML_ASSERT' do { \
^ ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138 s);
^ ggml.c:13140: Calling 'ggml_compute_forward_map_unary' ta);
^ ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877 n) {
^ ggml.c:12878: Calling 'ggml_compute_forward_map_unary_f32' {
^ ggml.c:12850: Taking false branch n) {
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:12850: Loop condition is false. Exiting loop n) {
^ ggml.h:203: expanded from macro 'GGML_ASSERT' do { \
^ ggml.c:12852: Field 'type' is equal to GGML_TASK_INIT ));
^ ggml.c:12852: Left side of '||' is true ));
^ ggml.c:12853: Returning without writing to 'dst->op', which participates in a condition later E) {
^ ggml.c:12878: Returning from 'ggml_compute_forward_map_unary_f32' {
^ ggml.c:12879: Execution continues on line 12879 un);
^ ggml.c:12887: Returning without writing to 'dst->op', which participates in a condition later }
^ ggml.c:13140: Returning from 'ggml_compute_forward_map_unary' ta);
^ ggml.c:13142: Execution continues on line 13141 }
^ ggml.c:13160: Returning without writing to 'tensor->op', which participates in a condition later }
^ ggml.c:14484: Returning from 'ggml_compute_forward' };
^ ggml.c:14487: Field 'n_tasks' is <= 1 PUTE
^ ggml.c:14487: Taking false branch PUTE
^ ggml.c:14520: Calling 'ggml_compute_forward' UTE;
^ ggml.c:12939: 'params' is non-null r) {
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:12939: Taking false branch r) {
^ ggml.h:204: expanded from macro 'GGML_ASSERT' if (!(x)) { \
^ ggml.c:12939: Loop condition is false. Exiting loop r) {
^ ggml.h:203: expanded from macro 'GGML_ASSERT' do { \
^ ggml.c:12941: Control jumps to 'case GGML_OP_MAP_UNARY:' at line 13138 s);
^ ggml.c:13140: Calling 'ggml_compute_forward_map_unary' ta);
^ ggml.c:12875: Control jumps to 'case GGML_TYPE_F32:' at line 12877 n) {
^ ggml.c:12878: Value assigned to field 'op', which participates in a condition later {
^ ggml.c:12879: Execution continues on line 12879 un);
^ ggml.c:13140: Returning from 'ggml_compute_forward_map_unary' ta);
^ ggml.c:13142: Execution continues on line 13141 }
^ ggml.c:14520: Returning from 'ggml_compute_forward' UTE;
^ ggml.c:14523: Assuming field 'n_tasks' is > 1 pool
^ ggml.c:14523: Taking true branch pool
^ ggml.c:14524: Assuming the condition is false 1) {
^ /usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add' #define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
^ ggml.c:14524: Taking false branch 1) {
^ ggml.c:14528: Loop condition is false. Execution continues on line 14534 }
^ ggml.c:14535: Loop condition is false. Execution continues on line 14543 1);
^ ggml.c:14542: Field 'n_tasks' is > 1 LIZE
^ ggml.c:14542: Left side of '&&' is true LIZE
^ ggml.c:14542: Assuming the condition is true LIZE
^ ggml.c:14542: Taking true branch LIZE
^ ggml.c:14543: Assuming the condition is false ]) {
^ /usr/lib/llvm-15/lib/clang/15.0.7/include/stdatomic.h:141: expanded from macro 'atomic_fetch_add' #define atomic_fetch_add(object, operand) __c11_atomic_fetch_add(object, operand, __ATOMIC_SEQ_CST)
^ ggml.c:14543: Taking false branch ]) {
^ ggml.c:14547: Loop condition is false. Execution continues on line 14554 }
^ ggml.c:14553: Assuming the condition is true pool
^ ggml.c:14553: Loop condition is true. Entering loop body pool
^ ggml.c:14558: Field 'work' is null sks,
^ ggml.c:14558: '?' condition is false sks,
^ ggml.c:14559: Field 'work' is null : 0,
^ ggml.c:14559: '?' condition is false : 0,
^ ggml.c:14554: Dereference of null pointer +) {
^ |
||
atomic_store(&state_shared.has_work, false); | ||
} | ||
|
@@ -14450,7 +14587,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) | |
ggml_compute_forward(&params, node); | ||
|
||
// wait for thread pool | ||
if (node->n_tasks > 1) { | ||
if (node->n_tasks > 1 && GGML_OP_HAS_FINALIZE[node->op]) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a good change |
||
if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) { | ||
atomic_store(&state_shared.has_work, false); | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -163,6 +163,9 @@ static std::string llama_format_win_err(DWORD err) { | |
} | ||
#endif | ||
|
||
extern "C" { | ||
bool ggml_is_numa(); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This has to be avoided. Probably utilize new There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this function is still needed, because even if NUMA support is compiled in, there are some operations that only make sense if the system actually has more than one NUMA node. It also makes it easier to disable those operations, because that function always returns false if ggml_numa_init() is never called, so we can add a --numa option and not call ggml_numa_init() otherwise, causing those operations to be disabled. The latest commits do this. |
||
struct llama_mmap { | ||
void * addr; | ||
size_t size; | ||
|
@@ -176,8 +179,10 @@ struct llama_mmap { | |
size = file->size; | ||
int fd = fileno(file->fp); | ||
int flags = MAP_SHARED; | ||
// prefetch/readahead impairs performance on NUMA systems | ||
if (ggml_is_numa()) { prefetch = 0; } | ||
#ifdef __linux__ | ||
flags |= MAP_POPULATE; | ||
if (prefetch) { flags |= MAP_POPULATE; } | ||
#endif | ||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); | ||
if (addr == MAP_FAILED) { | ||
|
@@ -191,6 +196,14 @@ struct llama_mmap { | |
strerror(errno)); | ||
} | ||
} | ||
if (ggml_is_numa()) { | ||
// advise the kernel not to use readahead | ||
// (because the next page might not belong on the same node) | ||
if (madvise(addr, file->size, MADV_RANDOM)) { | ||
fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n", | ||
strerror(errno)); | ||
} | ||
} | ||
} | ||
|
||
~llama_mmap() { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This has to become part of
g_state