From 5f36833b71e8bcce6ab9c89e8e98246638e465a1 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Mon, 3 Jul 2023 10:06:22 -0300
Subject: [PATCH 01/11] Implement new GC heuristics.

---
 NEWS.md               |   1 +
 doc/src/devdocs/gc.md |  12 ++-
 src/gc-debug.c        |  22 ++++-
 src/gc-pages.c        |   4 +
 src/gc.c              | 192 +++++++++++++++++++++++-------------------
 src/gc.h              |  16 +++-
 6 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 631806296eebd..2af55a888fc91 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,7 @@ Language changes
 Compiler/Runtime improvements
 -----------------------------
+* Updated GC heuristics to count allocated pages instead of individual objects ([#50144]).
 Command-line option changes
 ---------------------------
diff --git a/doc/src/devdocs/gc.md b/doc/src/devdocs/gc.md
index c072912e77c3f..942535f426b34 100644
--- a/doc/src/devdocs/gc.md
+++ b/doc/src/devdocs/gc.md
@@ -67,6 +67,12 @@ This scheme eliminates the need of explicitly keeping a flag to indicate a full
 ## Heuristics
 GC heuristics tune the GC by changing the size of the allocation interval between garbage collections.
-If a GC was unproductive, then we increase the size of the allocation interval to allow objects more time to die.
-If a GC returns a lot of space we can shrink the interval. The goal is to find a steady state where we are
-allocating just about the same amount as we are collecting.
+
+The GC heuristics measure how big the heap is after a collection and set the next collection
+target according to the algorithm described in https://dl.acm.org/doi/10.1145/3563323.
+In summary, that paper argues that the heap target should have a square-root relationship with the live heap,
+scaled by how fast the GC is freeing memory and how fast the mutators are allocating it.
+The heuristics measure the heap size by counting the pages that are in use and the objects that use malloc.
+Previously we measured the heap size by counting live objects, which ignores fragmentation and can lead to bad decisions;
+it also meant that we used thread-local information (allocations) to make a process-wide decision (when to collect).
+Measuring pages makes that decision global.
+
+The GC will do full collections when the heap size reaches 80% of the maximum allowed size.
diff --git a/src/gc-debug.c b/src/gc-debug.c
index bab2c5b0fa607..56441ae09b8e6 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1,7 +1,10 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 #include "gc.h"
+#include "julia.h"
 #include
+#include
+#include
 #include
 // re-include assert.h without NDEBUG,
@@ -1216,15 +1219,30 @@ JL_DLLEXPORT void jl_enable_gc_logging(int enable) {
     gc_logging_enabled = enable;
 }
-void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT {
+void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT {
     if (!gc_logging_enabled) {
         return;
     }
     jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
-        pause/1e6, freed/1e6,
+        pause/1e6, freed/(double)(1<<20),
         full ? "full" : "incr",
         recollect ?
"recollect" : "" ); + + jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_allocd %.2f MB\nbytes_freed %.2f MB, bytes_mallocd %.1f, malloc_bytes_freed %.2f MB\npages_perm_allocd %zu, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n", + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_allocd)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_freed)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.pages_perm_allocd), + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20), + jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20), + live_bytes/(double)(1<<20) + ); + double bytes_mapped = (jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident) + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd) - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed))/(double)(1<<20); + jl_safe_printf("Fragmentation %f, mapped_bytes %.2f MB\n", (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size), bytes_mapped); + // Should fragmentation use bytes_resident instead of heap_size? } #ifdef __cplusplus diff --git a/src/gc-pages.c b/src/gc-pages.c index 682e76611f5d9..8d596f4a815ca 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -52,6 +52,8 @@ char *jl_gc_try_alloc_pages_(int pg_cnt) JL_NOTSAFEPOINT // round data pointer up to the nearest gc_page_data-aligned // boundary if mmap didn't already do so. mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mapped, pages_sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, pages_sz); return mem; } @@ -115,6 +117,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT // try to get page from `pool_freed` meta = pop_lf_page_metadata_back(&global_page_pool_freed); if (meta != NULL) { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, GC_PAGE_SZ); gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED); goto exit; } @@ -188,6 +191,7 @@ void jl_gc_free_page(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT madvise(p, decommit_size, MADV_DONTNEED); #endif msan_unpoison(p, decommit_size); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, -decommit_size); } #ifdef __cplusplus diff --git a/src/gc.c b/src/gc.c index 9fd93b7340d56..325c4b023c824 100644 --- a/src/gc.c +++ b/src/gc.c @@ -178,6 +178,8 @@ jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; int gc_n_threads; jl_ptls_t* gc_all_tls_states; +gc_heapstatus_t gc_heap_stats = {0}; +int next_sweep_full = 0; const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) { @@ -665,19 +667,26 @@ static int64_t last_gc_total_bytes = 0; #ifdef _P64 typedef uint64_t memsize_t; static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 1250000000UL; static size_t total_mem; // We expose this to the user/ci as jl_gc_set_max_memory static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else typedef uint32_t memsize_t; static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); -static const size_t max_collect_interval = 500000000UL; // Work really hard to stay within 2GB // Alternative is to risk running out of address space // on 32 
bit architectures. -static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024; +#define MAX32HEAP 1536 * 1024 * 1024 +static memsize_t max_total_memory = (memsize_t) MAX32HEAP; #endif +// heuristic stuff for https://dl.acm.org/doi/10.1145/3563323 +static uint64_t old_pause_time = 0; +static uint64_t old_mut_time = 0; +static uint64_t old_heap_size = 0; +static uint64_t old_alloc_diff = 0; +static uint64_t old_freed_diff = 0; +static uint64_t gc_end_time = 0; + // global variables for GC stats @@ -912,7 +921,7 @@ void gc_setmark_buf(jl_ptls_t ptls, void *o, uint8_t mark_mode, size_t minsz) JL STATIC_INLINE void maybe_collect(jl_ptls_t ptls) { - if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { + if (jl_atomic_load_relaxed(&gc_heap_stats.heap_size) >= jl_atomic_load_relaxed(&gc_heap_stats.heap_target) || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); } else { @@ -1001,6 +1010,8 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif @@ -1046,6 +1057,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT if (nxt) nxt->prev = pv; gc_num.freed += v->sz&~3; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -1105,6 +1118,8 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_ptls_t ptls = jl_current_task->ptls; jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -1179,6 +1194,8 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } @@ -1250,6 +1267,8 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT set_page_metadata(pg); push_page_metadata_back(&ptls->page_metadata_allocd, pg); jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, GC_PAGE_SZ); p->newpages = fl; return fl; } @@ -1443,8 +1462,12 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo } else if (freed_lazily) { push_page_metadata_back(lazily_freed, pg); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); } else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); #ifdef _P64 // only enable concurrent sweeping on 64bit if (jl_n_sweepthreads == 0) { jl_gc_free_page(pg); @@ -3056,6 +3079,11 @@ JL_DLLEXPORT int64_t jl_gc_live_bytes(void) return live_bytes; } +double 
jl_gc_smooth(uint64_t old_val, uint64_t new_val, double factor) +{ + return factor * old_val + (1.0-factor) * new_val; +} + size_t jl_maxrss(void); // Only one thread should be running in this function @@ -3070,6 +3098,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) jl_gc_markqueue_t *mq = &ptls->mark_queue; uint64_t gc_start_time = jl_hrtime(); + uint64_t mutator_time = gc_start_time - gc_end_time; + uint64_t before_free_heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); int64_t last_perm_scanned_bytes = perm_scanned_bytes; uint64_t start_mark_time = jl_hrtime(); JL_PROBE_GC_MARK_BEGIN(); @@ -3160,19 +3190,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t mark_time = end_mark_time - start_mark_time; gc_num.mark_time = mark_time; gc_num.total_mark_time += mark_time; - int64_t allocd = gc_num.allocd; gc_settime_postmark_end(); // marking is over // Flush everything in mark cache gc_sync_all_caches_nolock(ptls); - int64_t live_sz_ub = live_bytes + allocd; - int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; - int64_t estimate_freed = live_sz_ub - live_sz_est; gc_verify(ptls); - gc_stats_all_pool(); gc_stats_big_obj(); objprofile_printall(); @@ -3181,42 +3206,17 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (!prev_sweep_full) promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes; // 5. next collection decision - int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(allocd/10)); - int nptr = 0; + int remset_nptr = 0; + int sweep_full = next_sweep_full; + int recollect = 0; assert(gc_n_threads); for (int i = 0; i < gc_n_threads; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 != NULL) - nptr += ptls2->heap.remset_nptr; + remset_nptr += ptls2->heap.remset_nptr; } + (void)remset_nptr; //Use this information for something? - // many pointers in the intergen frontier => "quick" mark is not quick - int large_frontier = nptr*sizeof(void*) >= default_collect_interval; - int sweep_full = 0; - int recollect = 0; - - // update heuristics only if this GC was automatically triggered - if (collection == JL_GC_AUTO) { - if (large_frontier) { - sweep_full = 1; - gc_num.interval = last_long_collect_interval; - } - if (not_freed_enough || large_frontier) { - gc_num.interval = gc_num.interval * 2; - } - - size_t maxmem = 0; -#ifdef _P64 - // on a big memory machine, increase max_collect_interval to totalmem / nthreads / 2 - maxmem = total_mem / (gc_n_threads - jl_n_gcthreads) / 2; -#endif - if (maxmem < max_collect_interval) - maxmem = max_collect_interval; - if (gc_num.interval > maxmem) { - sweep_full = 1; - gc_num.interval = maxmem; - } - } // If the live data outgrows the suggested max_total_memory // we keep going with minimum intervals and full gcs until @@ -3236,7 +3236,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // on the first collection after sweep_full, and the current scan perm_scanned_bytes = 0; promoted_bytes = 0; - last_long_collect_interval = gc_num.interval; } scanned_bytes = 0; // 6. 
start sweeping @@ -3261,9 +3260,10 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (sweep_full) gc_sweep_perm_alloc(); } + JL_PROBE_GC_SWEEP_END(); - uint64_t gc_end_time = jl_hrtime(); + gc_end_time = jl_hrtime(); uint64_t pause = gc_end_time - gc_start_time; uint64_t sweep_time = gc_end_time - start_sweep_time; gc_num.total_sweep_time += sweep_time; @@ -3272,6 +3272,44 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_num.last_full_sweep = gc_end_time; } + size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); + uint64_t alloc_diff = before_free_heap_size - old_heap_size; + uint64_t freed_diff = before_free_heap_size - heap_size; + + double alloc_smooth_factor = 0.95; + double collec_smooth_factor = 0.5; + double tuning_factor = 0.03; + double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor); + double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor); + double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collec_smooth_factor); + double gc_time = jl_gc_smooth(old_pause_time, pause, collec_smooth_factor); + old_alloc_diff = alloc_diff; + old_mut_time = mutator_time; + old_freed_diff = freed_diff; + old_pause_time = pause; + old_heap_size = heap_size; + double min_interval = default_collect_interval; + double target_allocs; + if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0) { + double alloc_rate = alloc_mem/alloc_time; + double gc_rate = gc_mem/gc_time; + target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval + } + else + target_allocs = 2*sqrt((double)heap_size/min_interval); + + uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size; + if (target_heap > max_total_memory) + target_heap = max_total_memory; + else if (target_heap < default_collect_interval) + target_heap = default_collect_interval; + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, target_heap); + + double old_ratio = (double)promoted_bytes/(double)heap_size; + if (heap_size > max_total_memory * 0.8 || old_ratio > 0.15) + next_sweep_full = 1; + else + next_sweep_full = 0; // sweeping is over // 7. if it is a quick sweep, put back the remembered objects in queued state // so that we don't trigger the barrier again on them. 
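The hunk above is the core of the new heuristic, so a compact restatement may help when reading the rest of the series. The following standalone C sketch mirrors the names used in the patch (heap_size, min_interval, tuning_factor, default_collect_interval, max_total_memory); it only illustrates the target computation and is not the exact code added to _jl_gc_collect:

#include <math.h>
#include <stdint.h>

// Heap target after a collection: grow by a number of "intervals" proportional
// to the square root of the live heap, scaled by how fast mutators allocate
// relative to how fast the collector frees memory, then clamp the result to
// [default_collect_interval, max_total_memory].
static uint64_t heap_target_sketch(uint64_t heap_size, double alloc_rate, double gc_rate,
                                   double min_interval, double tuning_factor,
                                   uint64_t default_collect_interval, uint64_t max_total_memory)
{
    double target_allocs;
    if (alloc_rate != 0 && gc_rate != 0)
        // work in multiples of the minimum collection interval
        target_allocs = sqrt(((double)heap_size / min_interval * alloc_rate) /
                             (gc_rate * tuning_factor));
    else
        target_allocs = 2 * sqrt((double)heap_size / min_interval);
    uint64_t target_heap = (uint64_t)(target_allocs * min_interval) + heap_size;
    if (target_heap > max_total_memory)
        target_heap = max_total_memory;
    else if (target_heap < default_collect_interval)
        target_heap = default_collect_interval;
    return target_heap;
}

In the patch, alloc_rate and gc_rate come from exponentially smoothed measurements (alloc_mem/alloc_time and gc_mem/gc_time, via jl_gc_smooth), and the fallback branch is taken whenever any of the smoothed inputs is zero.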
@@ -3303,55 +3341,19 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); - - gc_final_pause_end(gc_start_time, gc_end_time); - gc_time_sweep_pause(gc_end_time, allocd, live_bytes, - estimate_freed, sweep_full); - gc_num.full_sweep += sweep_full; + _report_gc_finished(pause, gc_num.freed, sweep_full, recollect, live_bytes); uint64_t max_memory = last_live_bytes + gc_num.allocd; if (max_memory > gc_num.max_memory) { gc_num.max_memory = max_memory; } - + gc_final_pause_end(gc_start_time, gc_end_time); + gc_time_sweep_pause(gc_end_time, allocd, live_bytes, + estimate_freed, sweep_full); + gc_num.full_sweep += sweep_full; last_live_bytes = live_bytes; - // Can't call inc_live_bytes here because we already added allocd - // to the graph earlier live_bytes += -gc_num.freed + gc_num.allocd; jl_timing_counter_dec(JL_TIMING_COUNTER_HeapSize, gc_num.freed); - if (collection == JL_GC_AUTO) { - //If we aren't freeing enough or are seeing lots and lots of pointers let it increase faster - if (!not_freed_enough || large_frontier) { - int64_t tot = 2 * (live_bytes + gc_num.allocd) / 3; - if (gc_num.interval > tot) { - gc_num.interval = tot; - last_long_collect_interval = tot; - } - // If the current interval is larger than half the live data decrease the interval - } - else { - int64_t half = (live_bytes / 2); - if (gc_num.interval > half) - gc_num.interval = half; - } - - // But never go below default - if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; - } - - if (gc_num.interval + live_bytes > max_total_memory) { - if (live_bytes < max_total_memory) { - gc_num.interval = max_total_memory - live_bytes; - last_long_collect_interval = max_total_memory - live_bytes; - } - else { - // We can't stay under our goal so let's go back to - // the minimum interval and hope things get better - gc_num.interval = default_collect_interval; - } - } - gc_time_summary(sweep_full, t_start, gc_end_time, gc_num.freed, live_bytes, gc_num.interval, pause, gc_num.time_to_safepoint, @@ -3542,7 +3544,7 @@ void jl_gc_init(void) arraylist_new(&finalizer_list_marked, 0); arraylist_new(&to_finalize, 0); - + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, default_collect_interval); gc_num.interval = default_collect_interval; last_long_collect_interval = default_collect_interval; gc_num.allocd = 0; @@ -3558,7 +3560,7 @@ void jl_gc_init(void) if (total_mem < 128e9) percent = total_mem * 2.34375e-12 + 0.6; // 60% at 0 gigs and 90% at 128 to not else // overcommit too much on memory contrained devices - percent = 0.9; + percent = 0.8; max_total_memory = total_mem * percent; #endif if (jl_options.heap_size_hint) @@ -3571,7 +3573,11 @@ JL_DLLEXPORT void jl_gc_set_max_memory(uint64_t max_mem) { if (max_mem > 0 && max_mem < (uint64_t)1 << (sizeof(memsize_t) * 8 - 1)) { + #ifdef _P64 max_total_memory = max_mem; + #else + max_total_memory = max_mem < MAX32HEAP ? 
max_mem : MAX32HEAP; + #endif } } @@ -3599,6 +3605,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } return malloc(sz); } @@ -3614,6 +3622,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz); } return calloc(nm, sz); } @@ -3629,6 +3639,8 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); jl_atomic_store_relaxed(&ptls->gc_num.freecall, jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz); } } @@ -3647,6 +3659,9 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old); } return realloc(p, sz); } @@ -3721,6 +3736,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); @@ -3759,7 +3776,9 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz); int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); @@ -3848,6 +3867,8 @@ static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned o #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd,sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size,sz); errno = last_errno; jl_may_leak(base); assert(align > 0); @@ -3891,6 +3912,7 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED)) return NULL; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.pages_perm_allocd, 1); #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; diff --git a/src/gc.h b/src/gc.h index b1eee5a1d5bda..6e95cc93711b8 100644 --- a/src/gc.h +++ b/src/gc.h @@ -10,6 +10,7 @@ #define JL_GC_H #include +#include #include #include #include @@ -257,6 +258,18 @@ typedef 
struct { pagetable1_t *meta1[REGION2_PG_COUNT]; } pagetable_t; +typedef struct { + _Atomic(size_t) bytes_mapped; + _Atomic(size_t) bytes_resident; + _Atomic(size_t) bytes_freed; + _Atomic(size_t) bytes_allocd; + _Atomic(size_t) bytes_mallocd; + _Atomic(size_t) malloc_bytes_freed; + _Atomic(size_t) pages_perm_allocd; + _Atomic(size_t) heap_size; + _Atomic(size_t) heap_target; +} gc_heapstatus_t; + #define GC_PAGE_UNMAPPED 0 #define GC_PAGE_ALLOCATED 1 #define GC_PAGE_LAZILY_FREED 2 @@ -374,6 +387,7 @@ extern int64_t lazy_freed_pages; extern int gc_first_tid; extern int gc_n_threads; extern jl_ptls_t* gc_all_tls_states; +extern gc_heapstatus_t gc_heap_stats; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { @@ -637,7 +651,7 @@ void gc_count_pool(void); size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_enable_gc_logging(int enable); -void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; +void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT; #ifdef __cplusplus } From 2f42cd517f1507306f3260ffe95b3be46665e66e Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 5 Jul 2023 17:07:36 -0300 Subject: [PATCH 02/11] Relax some of the atomics for sweeping --- src/gc.c | 10 ++++++++-- test/testenv.jl | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/gc.c b/src/gc.c index 325c4b023c824..211fe2c16d6e6 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1057,8 +1057,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT if (nxt) nxt->prev = pv; gc_num.freed += v->sz&~3; - jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); + jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + (v->sz&~3)); + jl_atomic_store_relaxed(&gc_heap_stats.heap_size, + jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - (v->sz&~3)); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -1194,6 +1196,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); + jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a)); + jl_atomic_store_relaxed(&gc_heap_stats.heap_size, + jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a)); jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); gc_num.freed += jl_array_nbytes(a); diff --git a/test/testenv.jl b/test/testenv.jl index 41706dd24e75e..21f3ee3b31dd1 100644 --- a/test/testenv.jl +++ b/test/testenv.jl @@ -37,6 +37,15 @@ if !@isdefined(testenv_defined) function addprocs_with_testenv(X; rr_allowed=true, kwargs...) exename = rr_allowed ? `$rr_exename $test_exename` : test_exename + if X isa Integer + if Sys.iswindows() + heap_size=round(Int,(Sys.free_memory()/(1024^2)/(X+1))) + heap_size -= 300 # I don't know anymore + else + heap_size=round(Int,(Sys.total_memory()/(1024^2)/(X+1))) + end + push!(test_exeflags.exec, "--heap-size-hint=$(heap_size)M") + end addprocs(X; exename=exename, exeflags=test_exeflags, kwargs...) 
end From 01d6e3910869c77a9d552fcdfb618fe6baec3b03 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 7 Jul 2023 11:03:51 -0300 Subject: [PATCH 03/11] Apply review suggestions. --- src/gc-debug.c | 13 ++++------ src/gc.c | 64 +++++++++++++++++--------------------------------- src/gc.h | 5 ---- 3 files changed, 26 insertions(+), 56 deletions(-) diff --git a/src/gc-debug.c b/src/gc-debug.c index 56441ae09b8e6..6e1587b17a6d9 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1229,19 +1229,14 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect recollect ? "recollect" : "" ); - jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_allocd %.2f MB\nbytes_freed %.2f MB, bytes_mallocd %.1f, malloc_bytes_freed %.2f MB\npages_perm_allocd %zu, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n", + jl_safe_printf("Heap stats: bytes_mapped %.2f MB, bytes_resident %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f", jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20), - jl_atomic_load_relaxed(&gc_heap_stats.bytes_allocd)/(double)(1<<20), - jl_atomic_load_relaxed(&gc_heap_stats.bytes_freed)/(double)(1<<20), - jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd)/(double)(1<<20), - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed)/(double)(1<<20), - jl_atomic_load_relaxed(&gc_heap_stats.pages_perm_allocd), + jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident)/(double)(1<<20), jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20), jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20), - live_bytes/(double)(1<<20) + live_bytes/(double)(1<<20), + (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size) ); - double bytes_mapped = (jl_atomic_load_relaxed(&gc_heap_stats.bytes_resident) + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd) - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed))/(double)(1<<20); - jl_safe_printf("Fragmentation %f, mapped_bytes %.2f MB\n", (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size), bytes_mapped); // Should fragmentation use bytes_resident instead of heap_size? 
} diff --git a/src/gc.c b/src/gc.c index 211fe2c16d6e6..3e1dcc9beeff4 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1010,7 +1010,6 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); #ifdef MEMDEBUG memset(v, 0xee, allocsz); @@ -1057,8 +1056,6 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT if (nxt) nxt->prev = pv; gc_num.freed += v->sz&~3; - jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + (v->sz&~3)); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - (v->sz&~3)); #ifdef MEMDEBUG @@ -1120,7 +1117,6 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_ptls_t ptls = jl_current_task->ptls; jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } @@ -1196,12 +1192,8 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); - jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, - jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a)); jl_atomic_store_relaxed(&gc_heap_stats.heap_size, jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a)); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } @@ -1273,7 +1265,6 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT set_page_metadata(pg); push_page_metadata_back(&ptls->page_metadata_allocd, pg); jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd, GC_PAGE_SZ); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, GC_PAGE_SZ); p->newpages = fl; return fl; @@ -1468,11 +1459,9 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo } else if (freed_lazily) { push_page_metadata_back(lazily_freed, pg); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); } else { - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); #ifdef _P64 // only enable concurrent sweeping on 64bit if (jl_n_sweepthreads == 0) { @@ -3279,29 +3268,30 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); - uint64_t alloc_diff = before_free_heap_size - old_heap_size; - uint64_t freed_diff = before_free_heap_size - heap_size; - - double alloc_smooth_factor = 0.95; - double collec_smooth_factor = 0.5; - double tuning_factor = 0.03; - double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor); - double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor); - double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collec_smooth_factor); - double gc_time = jl_gc_smooth(old_pause_time, pause, 
collec_smooth_factor); - old_alloc_diff = alloc_diff; - old_mut_time = mutator_time; - old_freed_diff = freed_diff; - old_pause_time = pause; - old_heap_size = heap_size; + double target_allocs = 0.0; double min_interval = default_collect_interval; - double target_allocs; - if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0) { - double alloc_rate = alloc_mem/alloc_time; - double gc_rate = gc_mem/gc_time; - target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval + if (collection == JL_GC_AUTO) { + uint64_t alloc_diff = before_free_heap_size - old_heap_size; + uint64_t freed_diff = before_free_heap_size - heap_size; + double alloc_smooth_factor = 0.95; + double collect_smooth_factor = 0.5; + double tuning_factor = 0.03; + double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor); + double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor); + double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collect_smooth_factor); + double gc_time = jl_gc_smooth(old_pause_time, pause, collect_smooth_factor); + old_alloc_diff = alloc_diff; + old_mut_time = mutator_time; + old_freed_diff = freed_diff; + old_pause_time = pause; + old_heap_size = heap_size; + if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) { + double alloc_rate = alloc_mem/alloc_time; + double gc_rate = gc_mem/gc_time; + target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval + } } - else + if (target_allocs == 0.0) target_allocs = 2*sqrt((double)heap_size/min_interval); uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size; @@ -3611,7 +3601,6 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); } return malloc(sz); @@ -3628,7 +3617,6 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz); } return calloc(nm, sz); @@ -3645,7 +3633,6 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); jl_atomic_store_relaxed(&ptls->gc_num.freecall, jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz); } } @@ -3665,8 +3652,6 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old); } return realloc(p, sz); @@ -3742,7 +3727,6 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, 
jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); int last_errno = errno; #ifdef _OS_WINDOWS_ @@ -3782,8 +3766,6 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz); int last_errno = errno; #ifdef _OS_WINDOWS_ @@ -3873,7 +3855,6 @@ static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned o #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif - jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd,sz); jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size,sz); errno = last_errno; jl_may_leak(base); @@ -3918,7 +3899,6 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED)) return NULL; - jl_atomic_fetch_add_relaxed(&gc_heap_stats.pages_perm_allocd, 1); #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; diff --git a/src/gc.h b/src/gc.h index 6e95cc93711b8..4bfe5dc328e9d 100644 --- a/src/gc.h +++ b/src/gc.h @@ -261,11 +261,6 @@ typedef struct { typedef struct { _Atomic(size_t) bytes_mapped; _Atomic(size_t) bytes_resident; - _Atomic(size_t) bytes_freed; - _Atomic(size_t) bytes_allocd; - _Atomic(size_t) bytes_mallocd; - _Atomic(size_t) malloc_bytes_freed; - _Atomic(size_t) pages_perm_allocd; _Atomic(size_t) heap_size; _Atomic(size_t) heap_target; } gc_heapstatus_t; From b033c0a92564b95103c885d328bc9a46af687e8b Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 7 Jul 2023 12:14:30 -0300 Subject: [PATCH 04/11] Add batching for counted functions --- src/gc.c | 33 ++++++++++++++++++++++++--------- src/julia_threads.h | 4 ++-- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/gc.c b/src/gc.c index 3e1dcc9beeff4..35111b84823cf 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1130,12 +1130,13 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT jl_ptls_t ptls = gc_all_tls_states[i]; if (ptls) { dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); - dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc); dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc); dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc); dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc); - dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc); + jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc - free_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); } } } @@ -3601,7 +3602,13 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + if 
(alloc_acc + sz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + sz); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + sz); + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0); + } } return malloc(sz); } @@ -3617,7 +3624,13 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + if (alloc_acc + sz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + sz * nm); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + sz * nm); + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0); + } } return calloc(nm, sz); } @@ -3629,11 +3642,13 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) free(p); if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); - jl_atomic_store_relaxed(&ptls->gc_num.freecall, - jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz); + uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc); + if (free_acc + sz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + sz); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + sz)); + jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0); + } } } diff --git a/src/julia_threads.h b/src/julia_threads.h index f4c235243e684..d4cbb88e619ba 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -130,12 +130,12 @@ typedef struct { typedef struct { _Atomic(int64_t) allocd; - _Atomic(int64_t) freed; _Atomic(uint64_t) malloc; _Atomic(uint64_t) realloc; _Atomic(uint64_t) poolalloc; _Atomic(uint64_t) bigalloc; - _Atomic(uint64_t) freecall; + _Atomic(int64_t) free_acc; + _Atomic(uint64_t) alloc_acc; } jl_thread_gc_num_t; typedef struct { From 2ab7f9568504b466373e487e5c9ea19b3dbfc38b Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 7 Jul 2023 12:18:07 -0300 Subject: [PATCH 05/11] Small fixup --- src/gc.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/gc.c b/src/gc.c index 35111b84823cf..3bbb30ddb723f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3659,10 +3659,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size if (pgcstack != NULL && ct->world_age) { jl_ptls_t ptls = ct->ptls; maybe_collect(ptls); - if (sz < old) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); - else + if (!(sz < old)) jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, @@ -3773,10 +3770,7 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; inc_live_bytes(allocsz - oldsz); } - else if (allocsz < oldsz) - jl_atomic_store_relaxed(&ptls->gc_num.freed, - jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz)); - else + else if (!(allocsz < oldsz)) jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); jl_atomic_store_relaxed(&ptls->gc_num.realloc, From 
7af549ebf929d48b6ad9e0d7bce2e01759e35eac Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 7 Jul 2023 13:03:22 -0300 Subject: [PATCH 06/11] Batch other big allocations as well --- src/gc.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/gc.c b/src/gc.c index 3bbb30ddb723f..d77712b82483f 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1010,7 +1010,13 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + if (alloc_acc + allocsz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + allocsz); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + allocsz); + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0); + } #ifdef MEMDEBUG memset(v, 0xee, allocsz); #endif @@ -1117,7 +1123,13 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT jl_ptls_t ptls = jl_current_task->ptls; jl_atomic_store_relaxed(&ptls->gc_num.allocd, jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + if (alloc_acc + sz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + sz); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + sz); + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0); + } } static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT @@ -3739,7 +3751,13 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); jl_atomic_store_relaxed(&ptls->gc_num.malloc, jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); - jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); + uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc); + if (alloc_acc + allocsz < 16*1024) + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + allocsz); + else { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + allocsz); + jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0); + } int last_errno = errno; #ifdef _OS_WINDOWS_ DWORD last_error = GetLastError(); From 4da775f5daddad72e16d7596a9e0acff3919b3ca Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Thu, 13 Jul 2023 20:59:16 -0300 Subject: [PATCH 07/11] Readd testenv changes --- batch.diff | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 batch.diff diff --git a/batch.diff b/batch.diff new file mode 100644 index 0000000000000..e7bdb4fc2e80c --- /dev/null +++ b/batch.diff @@ -0,0 +1,255 @@ +diff --git a/src/gc.c b/src/gc.c +index c85d1e5455..c82b2b645d 100644 +--- a/src/gc.c ++++ b/src/gc.c +@@ -6,6 +6,8 @@ + #include "julia_gcext.h" + #include "julia_assert.h" + #include ++#include ++#include + #include + #ifdef __GLIBC__ + #include // for malloc_trim +@@ -1004,8 +1006,14 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, + jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); +- 
jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + #ifdef MEMDEBUG + memset(v, 0xee, allocsz); + #endif +@@ -1051,8 +1059,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT + if (nxt) + nxt->prev = pv; + gc_num.freed += v->sz&~3; +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); ++ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, ++ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) + (v->sz&~3)); ++ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, ++ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) + (v->sz&~3)); + #ifdef MEMDEBUG + memset(v, 0xbb, v->sz&~3); + #endif +@@ -1112,8 +1122,14 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT + jl_ptls_t ptls = jl_current_task->ptls; + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + } + + static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT +@@ -1126,12 +1142,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT + jl_ptls_t ptls = gc_all_tls_states[i]; + if (ptls) { + dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval); +- dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed); + dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc); + dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc); + dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc); + dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc); +- dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh); ++ jl_atomic_store_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd)); ++ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed)); ++ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_thresh - free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); + } + } + } +@@ -1188,8 +1207,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT + jl_free_aligned(d); + else + free(d); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); ++ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, ++ 
jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a)); ++ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, ++ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a)); + gc_num.freed += jl_array_nbytes(a); + gc_num.freecall++; + } +@@ -3589,8 +3610,14 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz) + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + } + return malloc(sz); + } +@@ -3606,8 +3633,14 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz) + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz*nm < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz*nm); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz*nm); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz*nm); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + } + return calloc(nm, sz); + } +@@ -3619,12 +3652,15 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz) + free(p); + if (pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; +- jl_atomic_store_relaxed(&ptls->gc_num.freed, +- jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz); +- jl_atomic_store_relaxed(&ptls->gc_num.freecall, +- jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz); ++ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh); ++ if (free_thresh + sz < 128*1024) { ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz); ++ } ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz)); ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0); ++ } + } + } + +@@ -3635,17 +3671,28 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size + if (pgcstack != NULL && ct->world_age) { + jl_ptls_t ptls = ct->ptls; + maybe_collect(ptls); +- if (sz < old) +- jl_atomic_store_relaxed(&ptls->gc_num.freed, +- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz)); +- else ++ if (!(sz < old)) + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old)); + jl_atomic_store_relaxed(&ptls->gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); +- 
jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old); ++ ++ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh); ++ if (free_thresh + old < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + old); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + old); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + old)); ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0); ++ } ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + } + return realloc(p, sz); + } +@@ -3720,8 +3767,14 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz) + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); + jl_atomic_store_relaxed(&ptls->gc_num.malloc, + jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + sz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + int last_errno = errno; + #ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +@@ -3752,17 +3805,28 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds + ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz; + inc_live_bytes(allocsz - oldsz); + } +- else if (allocsz < oldsz) +- jl_atomic_store_relaxed(&ptls->gc_num.freed, +- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz)); +- else ++ else if (!(allocsz < oldsz)) + jl_atomic_store_relaxed(&ptls->gc_num.allocd, + jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz)); + jl_atomic_store_relaxed(&ptls->gc_num.realloc, + jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz); +- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz); ++ ++ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh); ++ if (free_thresh + oldsz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + oldsz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + oldsz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + oldsz)); ++ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0); ++ } ++ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); ++ if (alloc_thresh + allocsz < 128*1024) ++ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz); ++ else { ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz); ++ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz); ++ 
jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); ++ } + int last_errno = errno; + #ifdef _OS_WINDOWS_ + DWORD last_error = GetLastError(); +diff --git a/src/julia_threads.h b/src/julia_threads.h +index f4c235243e..a672a92fb9 100644 +--- a/src/julia_threads.h ++++ b/src/julia_threads.h +@@ -130,12 +130,12 @@ typedef struct { + + typedef struct { + _Atomic(int64_t) allocd; +- _Atomic(int64_t) freed; + _Atomic(uint64_t) malloc; + _Atomic(uint64_t) realloc; + _Atomic(uint64_t) poolalloc; + _Atomic(uint64_t) bigalloc; +- _Atomic(uint64_t) freecall; ++ _Atomic(int64_t) free_thresh; // fiels used to batch fetch add operations for the GC ++ _Atomic(uint64_t) alloc_thresh; + } jl_thread_gc_num_t; + + typedef struct { From b4bab03d3ae163f9a98d4627213fc69ae273dae6 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 14 Jul 2023 09:48:48 -0300 Subject: [PATCH 08/11] Let the heap increase a bit if we are thrashing --- batch.diff | 255 ----------------------------------------------------- src/gc.c | 8 +- 2 files changed, 5 insertions(+), 258 deletions(-) delete mode 100644 batch.diff diff --git a/batch.diff b/batch.diff deleted file mode 100644 index e7bdb4fc2e80c..0000000000000 --- a/batch.diff +++ /dev/null @@ -1,255 +0,0 @@ -diff --git a/src/gc.c b/src/gc.c -index c85d1e5455..c82b2b645d 100644 ---- a/src/gc.c -+++ b/src/gc.c -@@ -6,6 +6,8 @@ - #include "julia_gcext.h" - #include "julia_assert.h" - #include -+#include -+#include - #include - #ifdef __GLIBC__ - #include // for malloc_trim -@@ -1004,8 +1006,14 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz) - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz); - jl_atomic_store_relaxed(&ptls->gc_num.bigalloc, - jl_atomic_load_relaxed(&ptls->gc_num.bigalloc) + 1); -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz); -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz); -+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); -+ if (alloc_thresh + sz < 128*1024) -+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); -+ else { -+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz); -+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz); -+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0); -+ } - #ifdef MEMDEBUG - memset(v, 0xee, allocsz); - #endif -@@ -1051,8 +1059,10 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT - if (nxt) - nxt->prev = pv; - gc_num.freed += v->sz&~3; -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); -+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, -+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) + (v->sz&~3)); -+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, -+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) + (v->sz&~3)); - #ifdef MEMDEBUG - memset(v, 0xbb, v->sz&~3); - #endif -@@ -1112,8 +1122,14 @@ void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT - jl_ptls_t ptls = jl_current_task->ptls; - jl_atomic_store_relaxed(&ptls->gc_num.allocd, - jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz); -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); -- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); -+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh); -+ if (alloc_thresh + sz < 128*1024) -+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz); -+ else { -+ 
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- }
-
- static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
-@@ -1126,12 +1142,15 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
- jl_ptls_t ptls = gc_all_tls_states[i];
- if (ptls) {
- dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + gc_num.interval);
-- dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
- dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
- dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
- dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
- dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
-- dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
-+ jl_atomic_store_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd));
-+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed));
-+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_thresh - free_thresh + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
- }
- }
- }
-@@ -1188,8 +1207,10 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT
- jl_free_aligned(d);
- else
- free(d);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a));
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a));
-+ jl_atomic_store_relaxed(&gc_heap_stats.malloc_bytes_freed,
-+ jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed) + jl_array_nbytes(a));
-+ jl_atomic_store_relaxed(&gc_heap_stats.heap_size,
-+ jl_atomic_load_relaxed(&gc_heap_stats.heap_size) - jl_array_nbytes(a));
- gc_num.freed += jl_array_nbytes(a);
- gc_num.freecall++;
- }
-@@ -3589,8 +3610,14 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
- jl_atomic_load_relaxed(&ptls->gc_num.allocd) + sz);
- jl_atomic_store_relaxed(&ptls->gc_num.malloc,
- jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz);
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ if (alloc_thresh + sz < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- }
- return malloc(sz);
- }
-@@ -3606,8 +3633,14 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
- jl_atomic_load_relaxed(&ptls->gc_num.allocd) + nm*sz);
- jl_atomic_store_relaxed(&ptls->gc_num.malloc,
- jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, nm*sz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, nm*sz);
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ if (alloc_thresh + sz*nm < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz*nm);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz*nm);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz*nm);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- }
- return calloc(nm, sz);
- }
-@@ -3619,12 +3652,15 @@ JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
- free(p);
- if (pgcstack != NULL && ct->world_age) {
- jl_ptls_t ptls = ct->ptls;
-- jl_atomic_store_relaxed(&ptls->gc_num.freed,
-- jl_atomic_load_relaxed(&ptls->gc_num.freed) + sz);
-- jl_atomic_store_relaxed(&ptls->gc_num.freecall,
-- jl_atomic_load_relaxed(&ptls->gc_num.freecall) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, sz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -sz);
-+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
-+ if (free_thresh + sz < 128*1024) {
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + sz);
-+ }
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + sz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + sz));
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
-+ }
- }
- }
-
-@@ -3635,17 +3671,28 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
- if (pgcstack != NULL && ct->world_age) {
- jl_ptls_t ptls = ct->ptls;
- maybe_collect(ptls);
-- if (sz < old)
-- jl_atomic_store_relaxed(&ptls->gc_num.freed,
-- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (old - sz));
-- else
-+ if (!(sz < old))
- jl_atomic_store_relaxed(&ptls->gc_num.allocd,
- jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
- jl_atomic_store_relaxed(&ptls->gc_num.realloc,
- jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, old);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
-+
-+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
-+ if (free_thresh + old < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + old);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + old);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + old));
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
-+ }
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ if (alloc_thresh + sz < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- }
- return realloc(p, sz);
- }
-@@ -3720,8 +3767,14 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
- jl_atomic_load_relaxed(&ptls->gc_num.allocd) + allocsz);
- jl_atomic_store_relaxed(&ptls->gc_num.malloc,
- jl_atomic_load_relaxed(&ptls->gc_num.malloc) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz);
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ if (alloc_thresh + sz < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + sz);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + sz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + sz);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- int last_errno = errno;
- #ifdef _OS_WINDOWS_
- DWORD last_error = GetLastError();
-@@ -3752,17 +3805,28 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
- ptls->gc_cache.perm_scanned_bytes += allocsz - oldsz;
- inc_live_bytes(allocsz - oldsz);
- }
-- else if (allocsz < oldsz)
-- jl_atomic_store_relaxed(&ptls->gc_num.freed,
-- jl_atomic_load_relaxed(&ptls->gc_num.freed) + (oldsz - allocsz));
-- else
-+ else if (!(allocsz < oldsz))
- jl_atomic_store_relaxed(&ptls->gc_num.allocd,
- jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
- jl_atomic_store_relaxed(&ptls->gc_num.realloc,
- jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, allocsz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz);
-- jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
-+
-+ uint64_t free_thresh = jl_atomic_load_relaxed(&ptls->gc_num.free_thresh);
-+ if (free_thresh + oldsz < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, free_thresh + oldsz);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, free_thresh + oldsz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_thresh + oldsz));
-+ jl_atomic_store_relaxed(&ptls->gc_num.free_thresh, 0);
-+ }
-+ uint64_t alloc_thresh = jl_atomic_load_relaxed(&ptls->gc_num.alloc_thresh);
-+ if (alloc_thresh + allocsz < 128*1024)
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, alloc_thresh + allocsz);
-+ else {
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, alloc_thresh + allocsz);
-+ jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_thresh + allocsz);
-+ jl_atomic_store_relaxed(&ptls->gc_num.alloc_thresh, 0);
-+ }
- int last_errno = errno;
- #ifdef _OS_WINDOWS_
- DWORD last_error = GetLastError();
-diff --git a/src/julia_threads.h b/src/julia_threads.h
-index f4c235243e..a672a92fb9 100644
---- a/src/julia_threads.h
-+++ b/src/julia_threads.h
-@@ -130,12 +130,12 @@ typedef struct {
-
- typedef struct {
- _Atomic(int64_t) allocd;
-- _Atomic(int64_t) freed;
- _Atomic(uint64_t) malloc;
- _Atomic(uint64_t) realloc;
- _Atomic(uint64_t) poolalloc;
- _Atomic(uint64_t) bigalloc;
-- _Atomic(uint64_t) freecall;
-+ _Atomic(int64_t) free_thresh; // fields used to batch fetch add operations for the GC
-+ _Atomic(uint64_t) alloc_thresh;
- } jl_thread_gc_num_t;
-
- typedef struct {
diff --git a/src/gc.c b/src/gc.c
index d77712b82483f..eb728abf1da34 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -3279,7 +3279,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 if (sweep_full) {
 gc_num.last_full_sweep = gc_end_time;
 }
-
+
+ int thrashing = 0; // maybe we should report this to the user or error out?
 size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
 double target_allocs = 0.0;
 double min_interval = default_collect_interval;
@@ -3298,17 +3299,18 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 old_freed_diff = freed_diff;
 old_pause_time = pause;
 old_heap_size = heap_size;
+ thrashing = gc_time > mutator_time * 98 ? 1 : 0;
 if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) {
 double alloc_rate = alloc_mem/alloc_time;
 double gc_rate = gc_mem/gc_time;
 target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval
 }
 }
- if (target_allocs == 0.0)
+ if (target_allocs == 0.0 || thrashing) // If we are thrashing, go back to the default
 target_allocs = 2*sqrt((double)heap_size/min_interval);
 uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size;
- if (target_heap > max_total_memory)
+ if (target_heap > max_total_memory && !thrashing) // Allow the heap to go over the limit while thrashing; we accept the OOM risk
 target_heap = max_total_memory;
 else if (target_heap < default_collect_interval)
 target_heap = default_collect_interval;

From 2cbba34516744c737f3ba11c99f330c46dd3ac10 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Fri, 14 Jul 2023 18:24:57 -0300
Subject: [PATCH 09/11] Whitespace

---
 src/gc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gc.c b/src/gc.c
index eb728abf1da34..04dd05f32559c 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -3279,7 +3279,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 if (sweep_full) {
 gc_num.last_full_sweep = gc_end_time;
 }
-
+
 int thrashing = 0; // maybe we should report this to the user or error out?
 size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
 double target_allocs = 0.0;

From e6a18f3e937b25677a04644846ad4fbac5205490 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Mon, 17 Jul 2023 18:42:15 -0300
Subject: [PATCH 10/11] Experiment with not setting a max size by default at all

---
 src/gc.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/gc.c b/src/gc.c
index 8a1d42517c554..56baf8f2cabfe 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -3584,12 +3584,6 @@ void jl_gc_init(void)
 uint64_t constrained_mem = uv_get_constrained_memory();
 if (constrained_mem > 0 && constrained_mem < total_mem)
 total_mem = constrained_mem;
- double percent;
- if (total_mem < 128e9)
- percent = total_mem * 2.34375e-12 + 0.6; // 60% at 0 gigs and 90% at 128 to not
- else // overcommit too much on memory contrained devices
- percent = 0.8;
- max_total_memory = total_mem * percent;
 #endif
 if (jl_options.heap_size_hint)
 jl_gc_set_max_memory(jl_options.heap_size_hint);

From 15b34a5768f330d581472c461be2d663b794f5fa Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi
Date: Tue, 18 Jul 2023 22:11:18 -0300
Subject: [PATCH 11/11] Add under pressure callback

---
 src/gc.c                  | 17 +++++++++++++++++
 src/jl_exported_funcs.inc |  1 +
 2 files changed, 18 insertions(+)

diff --git a/src/gc.c b/src/gc.c
index 56baf8f2cabfe..a37101904683d 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -42,6 +42,8 @@ static jl_gc_callback_list_t *gc_cblist_pre_gc;
 static jl_gc_callback_list_t *gc_cblist_post_gc;
 static jl_gc_callback_list_t *gc_cblist_notify_external_alloc;
 static jl_gc_callback_list_t *gc_cblist_notify_external_free;
+static jl_gc_callback_list_t *gc_cblist_notify_gc_pressure;
+typedef void (*jl_gc_cb_notify_gc_pressure_t)(void);
 #define gc_invoke_callbacks(ty, list, args) \
 do { \
@@ -128,6 +130,14 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_external_free(jl_gc_cb_notify_external_fre
 jl_gc_deregister_callback(&gc_cblist_notify_external_free, (jl_gc_cb_func_t)cb);
 }
+JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable)
+{
+ if (enable)
+ jl_gc_register_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb);
+ else
+ jl_gc_deregister_callback(&gc_cblist_notify_gc_pressure, (jl_gc_cb_func_t)cb);
+}
+
 // Protect all access to `finalizer_list_marked` and `to_finalize`.
 // For accessing `ptls->finalizers`, the lock is needed if a thread
 // is going to realloc the buffer (of its own list) or accessing the
@@ -739,6 +749,7 @@ static int mark_reset_age = 0;
 static int64_t scanned_bytes; // young bytes scanned while marking
 static int64_t perm_scanned_bytes; // old bytes scanned while marking
 int prev_sweep_full = 1;
+int under_pressure = 0;
 // Full collection heuristics
 static int64_t live_bytes = 0;
@@ -3338,6 +3349,8 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 next_sweep_full = 1;
 else
 next_sweep_full = 0;
+ if (heap_size > max_total_memory * 0.8 || thrashing)
+ under_pressure = 1;
 // sweeping is over
 // 7. if it is a quick sweep, put back the remembered objects in queued state
 // so that we don't trigger the barrier again on them.
@@ -3487,6 +3500,10 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection)
 gc_invoke_callbacks(jl_gc_cb_post_gc_t, gc_cblist_post_gc,
 (collection));
+ if (under_pressure)
+ gc_invoke_callbacks(jl_gc_cb_notify_gc_pressure_t,
+ gc_cblist_notify_gc_pressure, ());
+ under_pressure = 0;
 #ifdef _OS_WINDOWS_
 SetLastError(last_error);
 #endif
diff --git a/src/jl_exported_funcs.inc b/src/jl_exported_funcs.inc
index c2b2a1578fd76..a7ffedd5cba10 100644
--- a/src/jl_exported_funcs.inc
+++ b/src/jl_exported_funcs.inc
@@ -191,6 +191,7 @@
 XX(jl_gc_schedule_foreign_sweepfunc) \
 XX(jl_gc_set_cb_notify_external_alloc) \
 XX(jl_gc_set_cb_notify_external_free) \
+ XX(jl_gc_set_cb_notify_gc_pressure) \
 XX(jl_gc_set_cb_post_gc) \
 XX(jl_gc_set_cb_pre_gc) \
 XX(jl_gc_set_cb_root_scanner) \
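
For reference, a minimal embedder-side sketch of how the pressure callback added in PATCH 11 could be consumed. This is an illustration, not part of the series: the prototype and typedef are spelled out locally because the patch only defines them inside src/gc.c (the symbol itself is exported via jl_exported_funcs.inc), and the Julia snippet passed to jl_eval_string and the handler's behavior are placeholder choices.

    // Hypothetical embedder using jl_gc_set_cb_notify_gc_pressure().
    // Assumes linking against a libjulia built with this patch series.
    #include <stdio.h>
    #include <julia.h>

    // Not declared in julia.h by this series, so declare it here.
    typedef void (*jl_gc_cb_notify_gc_pressure_t)(void);
    extern void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t cb, int enable);

    static volatile int gc_pressure_seen = 0;

    // Invoked after a collection that left the heap above ~80% of the
    // allowed size, or that was classified as thrashing; keep it cheap.
    static void on_gc_pressure(void)
    {
        gc_pressure_seen = 1;
        fprintf(stderr, "julia GC reports memory pressure; consider dropping caches\n");
    }

    int main(void)
    {
        jl_init();
        jl_gc_set_cb_notify_gc_pressure(on_gc_pressure, 1);   // register
        // Allocate enough to force a collection, then drop the references.
        jl_eval_string("a = [zeros(UInt8, 1<<20) for _ in 1:64]; a = nothing; GC.gc()");
        jl_gc_set_cb_notify_gc_pressure(on_gc_pressure, 0);   // deregister
        jl_atexit_hook(0);
        return gc_pressure_seen;
    }

The callback deliberately takes no arguments: it is a notification only, and the embedder is expected to query whatever state it cares about (or simply shed caches) rather than receive heap statistics from the GC.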