Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numa #1556

Merged
merged 20 commits into from
Jun 26, 2023
Merged

Numa #1556

Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
silence robot
  • Loading branch information
zrm committed May 23, 2023
commit 2c1b5ae1971c04df575cae4c828c0b3ef8fb57dc
26 changes: 15 additions & 11 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ struct ggml_numa_nodes ggml_numa = {

void ggml_numa_init(void)
{
if (ggml_numa.n_nodes > 0) return;
if (ggml_numa.n_nodes > 0) { return; }
#ifdef __linux__
struct stat st;
char path[256];
Expand All @@ -514,17 +514,21 @@ void ggml_numa_init(void)
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/node/node%u", ggml_numa.n_nodes);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) break;
if (stat(path, &st) != 0) { break; }
++ggml_numa.n_nodes;
}
// enumerate CPUs
while (true) {
rv = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%u", ggml_numa.total_cpus);
GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
if (stat(path, &st) != 0) break;
if (stat(path, &st) != 0) { break; }
++ggml_numa.total_cpus;
}
GGML_PRINT_DEBUG("found %u numa nodes, %u CPUs\n", ggml_numa.n_nodes, ggml_numa.total_cpus);
if (ggml_numa.n_nodes < 1 || ggml_numa.total_cpus < 1) {
ggml_numa.n_nodes = 0;
return;
}
ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: Call to 'calloc' has an allocation size of 0 bytes [clang-analyzer-optin.portability.UnixAPI]

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^
Additional context

ggml.c:507: Assuming field 'n_nodes' is <= 0

    if (ggml_numa.n_nodes > 0) return;
        ^

ggml.c:507: Taking false branch

    if (ggml_numa.n_nodes > 0) return;
    ^

ggml.c:513: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:515: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:515: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:515: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:515: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:516: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:516: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:516: Execution continues on line 521

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:520: Loop condition is true. Entering loop body

    while (true) {
    ^

ggml.c:522: Assuming 'rv' is > 0

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Left side of '&&' is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                    ^

ggml.c:522: Assuming the condition is true

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
                              ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
              ^

ggml.c:522: Taking false branch

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:204: expanded from macro 'GGML_ASSERT'

        if (!(x)) { \
        ^

ggml.c:522: Loop condition is false. Exiting loop

        GGML_ASSERT(rv > 0 && (unsigned)rv < sizeof(path));
        ^

ggml.h:203: expanded from macro 'GGML_ASSERT'

    do { \
    ^

ggml.c:523: Assuming the condition is true

        if (stat(path, &st) != 0) break;
            ^

ggml.c:523: Taking true branch

        if (stat(path, &st) != 0) break;
        ^

ggml.c:523: Execution continues on line 528

        if (stat(path, &st) != 0) break;
                                  ^

ggml.c:527: Call to 'calloc' has an allocation size of 0 bytes

    ggml_numa.nodes = calloc(ggml_numa.n_nodes, sizeof(struct ggml_numa_node));
                      ^

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Avoid allocs or figure out a way to release the memory at the end of the program.
Alternatively, use static size arrays with max allowed size as constants

GGML_ASSERT(ggml_numa.nodes != NULL);
for (uint32_t n = 0; n < ggml_numa.n_nodes; ++n) {
Expand Down Expand Up @@ -14058,34 +14062,34 @@ typedef pthread_t ggml_thread_t;
#ifdef __linux__
// Pin the calling thread to the CPUs of one NUMA node.
//
// The node is chosen by evenly partitioning the n_threads worker threads
// across ggml_numa.n_nodes nodes: thread thread_n runs on node
// thread_n / (threads per node).
//
// thread_n  - index of this thread in [0, n_threads)
// n_threads - total number of worker threads
//
// No-op when the system is not NUMA (ggml_is_numa() is false). On failure
// pthread_setaffinity_np's error is reported to stderr as a warning; the
// thread keeps its previous affinity.
//
// NOTE(review): if n_threads < ggml_numa.n_nodes the divisor
// (n_threads / ggml_numa.n_nodes) is 0 and the division below would trap —
// presumably callers guarantee n_threads >= n_nodes; confirm at call sites.
void set_numa_thread_affinity(int thread_n, int n_threads)
{
    if (!ggml_is_numa()) { return; }
    // run thread on node_num thread_n / (threads per node)
    int node_num = thread_n / (n_threads / ggml_numa.n_nodes);
    struct ggml_numa_node *node = &ggml_numa.nodes[node_num];
    // cpu_set_t is allocated dynamically because total_cpus may exceed
    // the CPU_SETSIZE limit of a static cpu_set_t.
    size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
    cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
    CPU_ZERO_S(setsize, cpus);
    for (size_t i = 0; i < node->n_cpus; ++i) {
        CPU_SET_S(node->cpus[i], setsize, cpus);
    }
    int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
    if (rv) {
        fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
                strerror(rv));
    }
    CPU_FREE(cpus);
}
void clear_numa_thread_affinity(void)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function does not seem to be used

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The workers don't need it because they terminate. In the first attempt I wasn't setting affinity for the main thread, which seems to be fine because then it runs on the last remaining core. When I tried setting it performance was degraded if it stayed set after eval, so I wrote that to clear it when the worker threads were joined. That solved the performance degradation, but setting it and clearing it seemed to perform the same as not setting it to begin with, so then I didn't use it.

{
if (!ggml_is_numa()) return;
if (!ggml_is_numa()) { return; }
size_t setsize = CPU_ALLOC_SIZE(ggml_numa.total_cpus);
cpu_set_t *cpus = CPU_ALLOC(ggml_numa.total_cpus);
CPU_ZERO_S(setsize, cpus);
for (unsigned i=0; i < ggml_numa.total_cpus; ++i) {
for (unsigned i = 0; i < ggml_numa.total_cpus; ++i) {
CPU_SET_S(i, setsize, cpus);
}
int rv;
if((rv = pthread_setaffinity_np(pthread_self(), setsize, cpus))) {
int rv = pthread_setaffinity_np(pthread_self(), setsize, cpus);
if (rv) {
fprintf(stderr, "warning: pthread_setaffinity_np() failed: %s\n",
strerror(rv));
}
Expand Down
4 changes: 2 additions & 2 deletions llama-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,9 @@ struct llama_mmap {
int fd = fileno(file->fp);
int flags = MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (ggml_is_numa()) prefetch = 0;
if (ggml_is_numa()) { prefetch = 0; }
#ifdef __linux__
if (prefetch) flags |= MAP_POPULATE;
if (prefetch) { flags |= MAP_POPULATE; }
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
Expand Down