From f5e5a73d4afcbef9e26906c7ad48047666fe16f8 Mon Sep 17 00:00:00 2001
From: Gabriel Baraldi <baraldigabriel@gmail.com>
Date: Fri, 4 Aug 2023 21:29:42 -0300
Subject: [PATCH] Add fallback if we have make a weird GC decision. (#50682)

If something odd happens during GC (the PC goes to sleep) or a very big
transient the heuristics might make a bad decision. What this PR
implements is if we try to make our target more than double the one we
had before we fallback to a more conservative method. This fixes the new
issue @vtjnash found in https://github.com/JuliaLang/julia/issues/40644
for me.
---
 Make.inc            |  6 ++++
 src/gc-debug.c      |  6 ++--
 src/gc.c            | 81 +++++++++++++++++++++++++++++++++++----------
 test/cmdlineargs.jl |  4 +--
 test/testenv.jl     |  7 ++++
 5 files changed, 82 insertions(+), 22 deletions(-)

diff --git a/Make.inc b/Make.inc
index def5f40f97926..a772b5910069c 100644
--- a/Make.inc
+++ b/Make.inc
@@ -1461,6 +1461,12 @@ endef
 # Overridable in Make.user
 WINE ?= wine
 
+ifeq ($(BINARY),32)
+HEAPLIM := --heap-size-hint=1000M
+else
+HEAPLIM :=
+endif
+
 # many of the following targets must be = not := because the expansion of the makefile functions (and $1) shouldn't happen until later
 ifeq ($(BUILD_OS), WINNT) # MSYS
 spawn = $(1)
diff --git a/src/gc-debug.c b/src/gc-debug.c
index c68abf7e40b8e..b7be75aed4cb9 100644
--- a/src/gc-debug.c
+++ b/src/gc-debug.c
@@ -1410,17 +1410,17 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect
     if (!gc_logging_enabled) {
         return;
     }
-    jl_safe_printf("GC: pause %.2fms. collected %fMB. %s %s\n",
+    jl_safe_printf("\nGC: pause %.2fms. collected %fMB. %s %s\n",
         pause/1e6, freed/(double)(1<<20),
         full ? "full" : "incr",
         recollect ? "recollect" : ""
     );
 
-    jl_safe_printf("Heap stats: bytes_mapped %.2f MB, heap_size %.2f MB, heap_target %.2f MB, live_bytes %.2f MB\n, Fragmentation %.3f",
+    jl_safe_printf("Heap stats: bytes_mapped %.2f MB,\nheap_size %.2f MB, heap_target %.2f MB, Fragmentation %.3f\n",
         jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/(double)(1<<20),
+        // live_bytes/(double)(1<<20), live byes tracking is not accurate.
         jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/(double)(1<<20),
         jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/(double)(1<<20),
-        live_bytes/(double)(1<<20),
         (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)
     );
 }
diff --git a/src/gc.c b/src/gc.c
index 69537c529da60..a8a6a9cd3c742 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -1,6 +1,7 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 
 #include "gc.h"
+#include "julia.h"
 #include "julia_gcext.h"
 #include "julia_assert.h"
 #ifdef __GLIBC__
@@ -804,8 +805,8 @@ static uint64_t old_heap_size = 0;
 static uint64_t old_alloc_diff = 0;
 static uint64_t old_freed_diff = 0;
 static uint64_t gc_end_time = 0;
-
-
+static int thrash_counter = 0;
+static int thrashing = 0;
 // global variables for GC stats
 
 // Resetting the object to a young object, this is used when marking the
@@ -1337,7 +1338,10 @@ static void combine_thread_gc_counts(jl_gc_num_t *dest) JL_NOTSAFEPOINT
             dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
             uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
             uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
+            dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
             jl_atomic_store_relaxed(&gc_heap_stats.heap_size, alloc_acc - free_acc + jl_atomic_load_relaxed(&gc_heap_stats.heap_size));
+            jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
+            jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
         }
     }
 }
@@ -3533,9 +3537,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     // If the live data outgrows the suggested max_total_memory
     // we keep going with minimum intervals and full gcs until
     // we either free some space or get an OOM error.
-    if (live_bytes > max_total_memory) {
-        sweep_full = 1;
-    }
     if (gc_sweep_always_full) {
         sweep_full = 1;
     }
@@ -3569,8 +3570,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     uint64_t sweep_time = gc_end_time - start_sweep_time;
     gc_num.total_sweep_time += sweep_time;
     gc_num.sweep_time = sweep_time;
-    
-    int thrashing = 0; // maybe we should report this to the user or error out?
+
     size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size);
     double target_allocs = 0.0;
     double min_interval = default_collect_interval;
@@ -3581,24 +3581,32 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
         double collect_smooth_factor = 0.5;
         double tuning_factor = 0.03;
         double alloc_mem = jl_gc_smooth(old_alloc_diff, alloc_diff, alloc_smooth_factor);
-        double alloc_time = jl_gc_smooth(old_mut_time, mutator_time, alloc_smooth_factor);
+        double alloc_time = jl_gc_smooth(old_mut_time, mutator_time + sweep_time, alloc_smooth_factor); // Charge sweeping to the mutator
         double gc_mem = jl_gc_smooth(old_freed_diff, freed_diff, collect_smooth_factor);
-        double gc_time = jl_gc_smooth(old_pause_time, pause, collect_smooth_factor);
+        double gc_time = jl_gc_smooth(old_pause_time, pause - sweep_time, collect_smooth_factor);
         old_alloc_diff = alloc_diff;
         old_mut_time = mutator_time;
         old_freed_diff = freed_diff;
         old_pause_time = pause;
-        old_heap_size = heap_size;
-        thrashing = gc_time > mutator_time * 98 ? 1 : 0;
+        old_heap_size = heap_size; // TODO: Update these values dynamically instead of just during the GC
+        if (gc_time > alloc_time * 95 && !(thrash_counter < 4))
+            thrash_counter += 1;
+        else if (thrash_counter > 0)
+            thrash_counter -= 1;
         if (alloc_mem != 0 && alloc_time != 0 && gc_mem != 0 && gc_time != 0 ) {
             double alloc_rate = alloc_mem/alloc_time;
             double gc_rate = gc_mem/gc_time;
             target_allocs = sqrt(((double)heap_size/min_interval * alloc_rate)/(gc_rate * tuning_factor)); // work on multiples of min interval
         }
     }
-    if (target_allocs == 0.0 || thrashing) // If we are thrashing go back to default
-        target_allocs = 2*sqrt((double)heap_size/min_interval);
+    if (thrashing == 0 && thrash_counter >= 3)
+        thrashing = 1;
+    else if (thrashing == 1 && thrash_counter <= 2)
+        thrashing = 0; // maybe we should report this to the user or error out?
 
+    int bad_result = (target_allocs*min_interval + heap_size) > 2 * jl_atomic_load_relaxed(&gc_heap_stats.heap_target); // Don't follow through on a bad decision
+    if (target_allocs == 0.0 || thrashing || bad_result) // If we are thrashing go back to default
+        target_allocs = 2*sqrt((double)heap_size/min_interval);
     uint64_t target_heap = (uint64_t)target_allocs*min_interval + heap_size;
     if (target_heap > max_total_memory && !thrashing) // Allow it to go over if we are thrashing if we die we die
         target_heap = max_total_memory;
@@ -3852,10 +3860,10 @@ void jl_gc_init(void)
     total_mem = uv_get_total_memory();
     uint64_t constrained_mem = uv_get_constrained_memory();
     if (constrained_mem > 0 && constrained_mem < total_mem)
-        total_mem = constrained_mem;
+        jl_gc_set_max_memory(constrained_mem - 250*1024*1024); // LLVM + other libraries need some amount of memory
 #endif
     if (jl_options.heap_size_hint)
-        jl_gc_set_max_memory(jl_options.heap_size_hint);
+        jl_gc_set_max_memory(jl_options.heap_size_hint - 250*1024*1024);
 
     jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL};
     gc_mark_loop(NULL, sp);
@@ -3959,7 +3967,26 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
                 jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (sz - old));
         jl_atomic_store_relaxed(&ptls->gc_num.realloc,
             jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
-        jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-old);
+
+        int64_t diff = sz - old;
+        if (diff < 0) {
+            uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
+            if (free_acc + diff < 16*1024)
+                jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
+            else {
+                jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
+                jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
+            }
+        }
+        else {
+            uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
+            if (alloc_acc + diff < 16*1024)
+                jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
+            else {
+                jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
+                jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
+            }
+        }
     }
     return realloc(p, sz);
 }
@@ -4076,7 +4103,27 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
             jl_atomic_load_relaxed(&ptls->gc_num.allocd) + (allocsz - oldsz));
     jl_atomic_store_relaxed(&ptls->gc_num.realloc,
         jl_atomic_load_relaxed(&ptls->gc_num.realloc) + 1);
-    jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, allocsz-oldsz);
+
+    int64_t diff = allocsz - oldsz;
+    if (diff < 0) {
+        uint64_t free_acc = jl_atomic_load_relaxed(&ptls->gc_num.free_acc);
+        if (free_acc + diff < 16*1024)
+            jl_atomic_store_relaxed(&ptls->gc_num.free_acc, free_acc + (-diff));
+        else {
+            jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(free_acc + (-diff)));
+            jl_atomic_store_relaxed(&ptls->gc_num.free_acc, 0);
+        }
+    }
+    else {
+        uint64_t alloc_acc = jl_atomic_load_relaxed(&ptls->gc_num.alloc_acc);
+        if (alloc_acc + diff < 16*1024)
+            jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, alloc_acc + diff);
+        else {
+            jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, alloc_acc + diff);
+            jl_atomic_store_relaxed(&ptls->gc_num.alloc_acc, 0);
+        }
+    }
+
     int last_errno = errno;
 #ifdef _OS_WINDOWS_
     DWORD last_error = GetLastError();
diff --git a/test/cmdlineargs.jl b/test/cmdlineargs.jl
index 7f86534f923b3..ffe055dfbd8c9 100644
--- a/test/cmdlineargs.jl
+++ b/test/cmdlineargs.jl
@@ -858,6 +858,6 @@ end
         @test lines[3] == "foo"
         @test lines[4] == "bar"
     end
-#heap-size-hint
-@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "524288000"
+#heap-size-hint, we reserve 250 MB for non GC memory (llvm, etc.)
+@test readchomp(`$(Base.julia_cmd()) --startup-file=no --heap-size-hint=500M -e "println(@ccall jl_gc_get_max_memory()::UInt64)"`) == "$((500-250)*1024*1024)"
 end
diff --git a/test/testenv.jl b/test/testenv.jl
index 41706dd24e75e..07b81812bb078 100644
--- a/test/testenv.jl
+++ b/test/testenv.jl
@@ -37,6 +37,13 @@ if !@isdefined(testenv_defined)
 
     function addprocs_with_testenv(X; rr_allowed=true, kwargs...)
         exename = rr_allowed ? `$rr_exename $test_exename` : test_exename
+<<<<<<< HEAD
+=======
+        if X isa Integer
+            heap_size=round(Int,(Sys.free_memory()/(1024^2)/(X+1)))
+            push!(test_exeflags.exec, "--heap-size-hint=$(heap_size)M")
+        end
+>>>>>>> ab94fadc9f (Add fallback if we have make a weird GC decision. (#50682))
         addprocs(X; exename=exename, exeflags=test_exeflags, kwargs...)
     end