GC scheduler refinements
d-netto committed Nov 30, 2023
1 parent fd67cb2 commit 0078148
Showing 2 changed files with 102 additions and 30 deletions.
src/gc.c: 117 changes (101 additions & 16 deletions)
@@ -31,6 +31,8 @@ uv_mutex_t gc_threads_lock;
uv_cond_t gc_threads_cond;
// To indicate whether concurrent sweeping should run
uv_sem_t gc_sweep_assists_needed;
+// Mutex used to coordinate entry of GC threads in the mark loop
+uv_mutex_t gc_queue_observer_lock;

// Linked list of callback functions

@@ -2863,8 +2865,10 @@ void gc_mark_and_steal(jl_ptls_t ptls)
    jl_gc_markqueue_t *mq = &ptls->mark_queue;
    jl_gc_markqueue_t *mq_master = NULL;
    int master_tid = jl_atomic_load(&gc_master_tid);
-    if (master_tid != -1)
-        mq_master = &gc_all_tls_states[master_tid]->mark_queue;
+    if (master_tid == -1) {
+        return;
+    }
+    mq_master = &gc_all_tls_states[master_tid]->mark_queue;
    void *new_obj;
    jl_gc_chunk_t c;
    pop : {
@@ -2939,28 +2943,108 @@ void gc_mark_and_steal(jl_ptls_t ptls)
    }
}

+size_t gc_count_work_in_queue(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // assume each chunk is worth 256 units of work and each pointer
+    // is worth 1 unit of work
+    size_t work = 256 * (jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.bottom) -
+                         jl_atomic_load_relaxed(&ptls->mark_queue.chunk_queue.top));
+    work += (jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.bottom) -
+             jl_atomic_load_relaxed(&ptls->mark_queue.ptr_queue.top));
+    return work;
+}
+
+/**
+ * Correctness argument for the mark-loop termination protocol.
+ *
+ * Safety properties:
+ * - No work items shall be in any thread's queues when `gc_mark_loop_barrier` observes
+ *   that `gc_n_threads_marking` is zero.
+ *
+ * - No work item shall be stolen from the master thread (i.e. mutator thread which started
+ *   GC and which helped the `jl_n_markthreads` - 1 threads to mark) after
+ *   `gc_mark_loop_barrier` observes that `gc_n_threads_marking` is zero. This property is
+ *   necessary because we call `gc_mark_loop_serial` after marking the finalizer list in
+ *   `_jl_gc_collect`, and want to ensure that we have the serial mark-loop semantics there,
+ *   and that no work is stolen from us at that point.
+ *
+ * Proof:
+ * - Suppose the master thread observes that `gc_n_threads_marking` is zero in
+ *   `gc_mark_loop_barrier` and there is a work item left in one thread's queue at that point.
+ *   Since threads try to steal from all threads' queues, this implies that all threads must
+ *   have tried to steal from the queue which still has a work item left, but failed to do so,
+ *   which violates the semantics of Chase-Lev's work-stealing queue.
+ *
+ * - Let E1 be the event "master thread writes -1 to gc_master_tid" and E2 be the event
+ *   "master thread observes that `gc_n_threads_marking` is zero". Since we're using
+ *   sequentially consistent atomics, E1 => E2. Now suppose one thread which is spinning in
+ *   `gc_should_mark` tries to enter the mark-loop after E2. In order to do so, it must
+ *   increment `gc_n_threads_marking` to 1 in an event E3, and then read `gc_master_tid` in an
+ *   event E4. Since we're using sequentially consistent atomics, E3 => E4. Since we observed
+ *   `gc_n_threads_marking` as zero in E2, then E2 => E3, and we conclude E1 => E4, so that
+ *   the thread which is spinning in `gc_should_mark` must observe that `gc_master_tid` is -1
+ *   and therefore won't enter the mark-loop.
+ */
+
+int gc_should_mark(jl_ptls_t ptls)
+{
+    int should_mark = 0;
+    int n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+    // fast path
+    if (n_threads_marking == 0) {
+        return 0;
+    }
+    uv_mutex_lock(&gc_queue_observer_lock);
+    while (1) {
+        int tid = jl_atomic_load(&gc_master_tid);
+        // fast path
+        if (tid == -1) {
+            break;
+        }
+        n_threads_marking = jl_atomic_load(&gc_n_threads_marking);
+        // fast path
+        if (n_threads_marking == 0) {
+            break;
+        }
+        size_t work = gc_count_work_in_queue(gc_all_tls_states[tid]);
+        for (tid = gc_first_tid; tid < gc_first_tid + jl_n_markthreads; tid++) {
+            work += gc_count_work_in_queue(gc_all_tls_states[tid]);
+        }
+        // if there is a lot of work left, enter the mark loop
+        if (work > 16 * n_threads_marking) {
+            jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+            should_mark = 1;
+            break;
+        }
+        jl_cpu_pause();
+    }
+    uv_mutex_unlock(&gc_queue_observer_lock);
+    return should_mark;
+}
+
+void gc_wake_all_for_marking(jl_ptls_t ptls)
+{
+    jl_atomic_store(&gc_master_tid, ptls->tid);
+    uv_mutex_lock(&gc_threads_lock);
+    jl_atomic_fetch_add(&gc_n_threads_marking, 1);
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+}
+
void gc_mark_loop_parallel(jl_ptls_t ptls, int master)
{
-    int backoff = GC_BACKOFF_MIN;
    if (master) {
-        jl_atomic_store(&gc_master_tid, ptls->tid);
-        // Wake threads up and try to do some work
-        uv_mutex_lock(&gc_threads_lock);
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        uv_cond_broadcast(&gc_threads_cond);
-        uv_mutex_unlock(&gc_threads_lock);
+        gc_wake_all_for_marking(ptls);
        gc_mark_and_steal(ptls);
        jl_atomic_fetch_add(&gc_n_threads_marking, -1);
    }
-    while (jl_atomic_load(&gc_n_threads_marking) > 0) {
-        // Try to become a thief while other threads are marking
-        jl_atomic_fetch_add(&gc_n_threads_marking, 1);
-        if (jl_atomic_load(&gc_master_tid) != -1) {
-            gc_mark_and_steal(ptls);
+    while (1) {
+        int should_mark = gc_should_mark(ptls);
+        if (!should_mark) {
+            break;
        }
+        gc_mark_and_steal(ptls);
        jl_atomic_fetch_add(&gc_n_threads_marking, -1);
-        // Failed to steal
-        gc_backoff(&backoff);
    }
}
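As a side note on the correctness argument added in this hunk (events E1 through E4), the following is a minimal, single-threaded C11 model of that ordering. It uses simplified, made-up names (master_finishes_marking, observer_tries_to_mark) and is only an illustrative sketch, not the runtime code in this commit.

/* Minimal C11 model of the E1..E4 ordering argument (sketch, not Julia runtime code). */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int master_tid;        /* models gc_master_tid */
static atomic_int n_threads_marking; /* models gc_n_threads_marking */

/* Master: E1 (retire the master tid) is ordered before E2 (observe that
 * nobody is marking); both are sequentially consistent. */
static void master_finishes_marking(void)
{
    atomic_store(&master_tid, -1);               /* E1 */
    while (atomic_load(&n_threads_marking) != 0) /* E2 */
        ;
}

/* Observer: E3 (announce intent to mark) is ordered before E4 (re-check the
 * master tid). If E2 already happened, E4 must see -1, so the observer backs
 * out instead of entering the mark loop. */
static int observer_tries_to_mark(void)
{
    atomic_fetch_add(&n_threads_marking, 1);     /* E3 */
    if (atomic_load(&master_tid) == -1) {        /* E4 */
        atomic_fetch_add(&n_threads_marking, -1);
        return 0;
    }
    return 1;
}

int main(void)
{
    atomic_store(&master_tid, 0);  /* a mark phase led by thread 0 ... */
    master_finishes_marking();     /* ... terminates: E1, then E2      */
    printf("observer enters mark loop? %d\n", observer_tries_to_mark()); /* prints 0 */
    return 0;
}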

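A concrete reading of the entry heuristic in gc_should_mark above: with the weights from gc_count_work_in_queue (256 work units per queued chunk, 1 per queued pointer), the `work > 16 * n_threads_marking` test means an observer that sees 3 threads marking joins only if more than 48 units remain across the queues, so a single queued chunk (256 units) is enough to pull it in, while, say, 40 leftover pointers are not.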
@@ -3728,6 +3812,7 @@ void jl_gc_init(void)
    uv_mutex_init(&gc_threads_lock);
    uv_cond_init(&gc_threads_cond);
    uv_sem_init(&gc_sweep_assists_needed, 0);
+    uv_mutex_init(&gc_queue_observer_lock);

    jl_gc_init_page();
    jl_gc_debug_init();
src/gc.h: 15 changes (1 addition & 14 deletions)
@@ -190,25 +190,12 @@ extern jl_gc_page_stack_t global_page_pool_lazily_freed;
extern jl_gc_page_stack_t global_page_pool_clean;
extern jl_gc_page_stack_t global_page_pool_freed;

-#define GC_BACKOFF_MIN 4
-#define GC_BACKOFF_MAX 12
-
-STATIC_INLINE void gc_backoff(int *i) JL_NOTSAFEPOINT
-{
-    if (*i < GC_BACKOFF_MAX) {
-        (*i)++;
-    }
-    for (int j = 0; j < (1 << *i); j++) {
-        jl_cpu_pause();
-    }
-}
-
// Lock-free stack implementation taken
// from Herlihy's "The Art of Multiprocessor Programming"
// XXX: this is not a general-purpose lock-free stack. We can
// get away with just using a CAS and not implementing some ABA
// prevention mechanism since once a node is popped from the
-// `jl_gc_global_page_pool_t`, it may only be pushed back to them
+// `jl_gc_page_stack_t`, it may only be pushed back to them
// in the sweeping phase, which also doesn't push a node into the
// same stack after it's popped
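For reference, the CAS-only shape that comment describes is essentially a Treiber stack. The sketch below uses hypothetical names (lf_stack_t, lf_push, lf_pop), not the actual jl_gc_page_stack_t operations from this file, and it likewise omits ABA protection, which is only safe under the usage pattern the comment spells out.

/* Treiber-style, CAS-only stack sketch (hypothetical names, not the
 * jl_gc_page_stack_t code in gc.h). No ABA protection: this is only safe
 * if a popped node cannot be pushed back while another pop may still be
 * holding a reference to it. */
#include <stdatomic.h>
#include <stdio.h>

typedef struct node {
    struct node *next;
    int value;
} node_t;

typedef struct {
    _Atomic(node_t *) top;
} lf_stack_t;

static void lf_push(lf_stack_t *s, node_t *n)
{
    node_t *old = atomic_load(&s->top);
    do {
        n->next = old; /* a failed CAS refreshes `old`, so relink and retry */
    } while (!atomic_compare_exchange_weak(&s->top, &old, n));
}

static node_t *lf_pop(lf_stack_t *s)
{
    node_t *old = atomic_load(&s->top);
    while (old != NULL &&
           !atomic_compare_exchange_weak(&s->top, &old, old->next)) {
        /* retry with the refreshed `old` */
    }
    return old; /* NULL if the stack was empty */
}

int main(void)
{
    static lf_stack_t stack; /* zero-initialized: empty stack */
    node_t a = {NULL, 1}, b = {NULL, 2};
    lf_push(&stack, &a);
    lf_push(&stack, &b);
    node_t *first = lf_pop(&stack);
    node_t *second = lf_pop(&stack);
    printf("%d %d\n", first->value, second->value); /* prints "2 1" (LIFO) */
    return 0;
}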

