
Page based heap size heuristics #50144

Merged: 13 commits, Jul 23, 2023
Readd testenv changes
gbaraldi committed Jul 13, 2023
commit 4da775f5daddad72e16d7596a9e0acff3919b3ca
batch.diff: 255 additions, 0 deletions
@@ -0,0 +1,255 @@
diff --git a/src/gc.c b/src/gc.c
Review comment: Remove?

index c85d1e5455..c82b2b645d 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -6,6 +6,8 @@
#include "julia_gcext.h"
#include "julia_assert.h"
#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
#include <sys/types.h>
#ifdef __GLIBC__
#include <malloc.h> // for malloc_trim
@@ -1004,8 +1006,14 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.bigalloc,
jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
#ifdef MEMDEBUG
memset(v, 0xee, allocsz);
#endif
@@ -1051,8 +1059,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
if (nxt)
nxt->prev = pv;
gc_num.freed += v->sz&~3;
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + (v->sz&~3));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - (v->sz&~3));
#ifdef MEMDEBUG
memset(v, 0xbb, v->sz&~3);
#endif
@@ -1112,8 +1122,14 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
jl_ptls_t ptls = jl_current_task->ptls;
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}

static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
@@ -1126,12 +1142,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
jl_ptls_t ptls = gc_all_tls_states[i];
if (ptls) {
dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval);
- dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
- dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ jl_atomic_store_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_thresh - free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
}
}
}
@@ -1188,8 +1207,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
jl_free_aligned(d);
else
free(d);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a));
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a));
+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a));
gc_num.freed += jl_array_nbytes(a);
gc_num.freecall++;
}
@@ -3589,8 +3610,14 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return malloc(sz);
}
@@ -3606,8 +3633,14 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz*nm < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz*nm);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz*nm);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz*nm);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return calloc(nm, sz);
}
@@ -3619,12 +3652,15 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
free(p);
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz);
- jl_atomic_store_relaxed(&ptls->gc_num.freecall,
- jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz);
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + sz < 128*1024) {
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
+ }
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
}
}

@@ -3635,17 +3671,28 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
if (pgcstack != NULL && ct->world_age) {
jl_ptls_t ptls = ct->ptls;
maybe_collect(ptls);
- if (sz < old)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz));
- else
+ if (!(sz < old))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + old < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + old);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + old);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + old));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
}
return realloc(p, sz);
}
@@ -3720,8 +3767,14 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
jl_atomic_store_relaxed(&ptls->gc_num.malloc,
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + sz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
@@ -3752,17 +3805,28 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
inc_live_bytes(allocsz - oldsz);
}
- else if (allocsz < oldsz)
- jl_atomic_store_relaxed(&ptls->gc_num.freed,
- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz));
- else
+ else if (!(allocsz < oldsz))
jl_atomic_store_relaxed(&ptls->gc_num.allocd,
jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
jl_atomic_store_relaxed(&ptls->gc_num.realloc,
jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz);
- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
+
+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
+ if (free_thresh + oldsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + oldsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + oldsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + oldsz));
+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
+ }
+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
+ if (alloc_thresh + allocsz < 128*1024)
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
+ else {
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
+ }
int last_errno = errno;
#ifdef _OS_WINDOWS_
DWORD last_error = GetLastError();
diff --git a/src/julia_threads.h b/src/julia_threads.h
index f4c235243e..a672a92fb9 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -130,12 +130,12 @@ typedef struct {

typedef struct {
_Atomic(int64_t) allocd;
- _Atomic(int64_t) freed;
_Atomic(uint64_t) malloc;
_Atomic(uint64_t) realloc;
_Atomic(uint64_t) poolalloc;
_Atomic(uint64_t) bigalloc;
- _Atomic(uint64_t) freecall;
+ _Atomic(uint64_t) free_thresh; // fields used to batch fetch-add operations for the GC
+ _Atomic(uint64_t) alloc_thresh;
} jl_thread_gc_num_t;

typedef struct {
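
For reference, the change above replaces one global atomic fetch-add per allocation with a per-thread byte counter that is flushed to the shared gc_heap_stats counters only once it reaches 128 KiB (128*1024 bytes). The following is a minimal, self-contained C11 sketch of that batching pattern, not the runtime's actual code: FLUSH_LIMIT, count_alloc, and the plain thread-local counter are illustrative stand-ins for the hard-coded 128*1024 threshold, the inlined per-call-site logic, and ptls->gc_num.alloc_thresh.

#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define FLUSH_LIMIT (128 * 1024) /* flush to the shared counters every 128 KiB */

/* Shared, contended counters (cf. gc_heap_stats in the diff). */
static _Atomic uint64_t bytes_mallocd;
static _Atomic int64_t heap_size;

/* Per-thread buffer of not-yet-flushed allocation bytes
 * (cf. ptls->gc_num.alloc_thresh). */
static _Thread_local uint64_t alloc_thresh;

/* Count an allocation of sz bytes. Small allocations only touch the
 * thread-local buffer; the expensive atomic RMW on the shared counters
 * happens at most once per FLUSH_LIMIT buffered bytes instead of once
 * per allocation. */
static void count_alloc(size_t sz)
{
    if (alloc_thresh + sz < FLUSH_LIMIT) {
        alloc_thresh += sz; /* cheap: no shared-memory traffic */
    }
    else {
        atomic_fetch_add_explicit(&bytes_mallocd, alloc_thresh + sz,
                                  memory_order_relaxed);
        atomic_fetch_add_explicit(&heap_size, (int64_t)(alloc_thresh + sz),
                                  memory_order_relaxed);
        alloc_thresh = 0; /* buffer was folded into the shared counters */
    }
}

int main(void)
{
    for (int i = 0; i < 100000; i++)
        count_alloc(64); /* 6.4 MB total, flushed in ~128 KiB batches */
    printf("flushed heap_size = %lld bytes\n",
           (long long)atomic_load_explicit(&heap_size, memory_order_relaxed));
    return 0;
}

Relaxed ordering is sufficient here because these counters only feed a heuristic; no other data is synchronized through them. The trade-off is that the shared totals can lag reality by up to FLUSH_LIMIT bytes per thread, which is why the diff folds the leftover alloc_thresh/free_thresh residue into gc_heap_stats when it aggregates per-thread state in combine_thread_gc_counts.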