A few fixes to make the marking more parallel-friendly
* Use normal fields instead of bitfields in page metadata.

    So that the fields can be modified concurrently.

* Use atomic operations instead of volatile for thread synchronization (see the sketch after this list).

* Use a safepoint instead of GC state transitions for the join.

    So that it's easier to branch into the GC.

* Only access global data in setmark if the tag is changed.

* Pass `ptls` to `gc_managed_realloc_`.

* Make `_jl_gc_collect` non-recursive.
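
The threadgroup changes below boil down to publishing data with a release store and spinning on an acquire load, instead of relying on `volatile`. Here is a minimal sketch of that pattern using C11 atomics as a stand-in for the `jl_atomic_store_release`/`jl_atomic_load_acquire` wrappers; the `barrier_t` type and the function names are illustrative, not the real `ti_threadgroup_t` API:

    // Sketch only: C11 atomics standing in for the jl_atomic_* wrappers.
    #include <stdatomic.h>
    #include <stdint.h>

    typedef struct {
        _Atomic uint8_t group_sense;  // written only by the master thread
        void *envelope;               // payload published to the workers
    } barrier_t;

    // Master: publish the payload, then flip the sense flag with release
    // ordering so a worker that observes the flip also observes the payload.
    static void barrier_fork(barrier_t *b, void *payload, uint8_t sense)
    {
        b->envelope = payload;
        atomic_store_explicit(&b->group_sense, sense, memory_order_release);
    }

    // Worker: spin with acquire loads; a plain volatile read gives no such
    // ordering guarantee, which is why the commit drops volatile.
    static void *barrier_wait(barrier_t *b, uint8_t sense)
    {
        while (atomic_load_explicit(&b->group_sense, memory_order_acquire) != sense)
            ;  // the real code also sleeps on a condition variable past a threshold
        return b->envelope;
    }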
yuyichao committed Jan 25, 2017
1 parent f1f2c0a commit 0c19af2
Showing 5 changed files with 72 additions and 67 deletions.
64 changes: 35 additions & 29 deletions src/gc.c
@@ -458,29 +458,30 @@ static inline uint16_t gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o,
assert(find_region(o) == NULL);
bigval_t *hdr = bigval_header(o);
if (mark_reset_age) {
// Reset the object as if it was just allocated
hdr->age = 0;
gc_big_object_unlink(hdr);
gc_big_object_link(hdr, &ptls->heap.big_objects);
mark_mode = GC_MARKED;
}
else {
if (gc_old(tag))
mark_mode = GC_OLD_MARKED;
if (mark_mode == GC_OLD_MARKED) {
// Move hdr from big_objects list to big_objects_marked list
gc_big_object_unlink(hdr);
gc_big_object_link(hdr, &big_objects_marked);
}
else if (gc_old(tag)) {
mark_mode = GC_OLD_MARKED;
}
tag = gc_set_bits(tag, mark_mode);
tag = jl_atomic_exchange_relaxed(&o->header, tag);
uint16_t tag_changed = !gc_marked(tag);
if (tag_changed) {
if (mark_mode == GC_OLD_MARKED)
if (mark_mode == GC_OLD_MARKED) {
perm_scanned_bytes += hdr->sz & ~3;
else
// Move hdr from big_objects list to big_objects_marked list
gc_big_object_unlink(hdr);
gc_big_object_link(hdr, &big_objects_marked);
}
else {
scanned_bytes += hdr->sz & ~3;
if (mark_reset_age) {
// Reset the object as if it was just allocated
hdr->age = 0;
gc_big_object_unlink(hdr);
gc_big_object_link(hdr, &ptls->heap.big_objects);
}
}
objprofile_count(jl_typeof(jl_valueof(o)),
mark_mode == GC_OLD_MARKED, hdr->sz&~3);
}
@@ -505,11 +506,6 @@ static inline uint16_t gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o,
if (mark_reset_age) {
// Reset the object as if it was just allocated
mark_mode = GC_MARKED;
page->has_young = 1;
char *page_begin = gc_page_data(o) + GC_PAGE_OFFSET;
int obj_id = (((char*)o) - page_begin) / page->osize;
uint8_t *ages = page->ages + obj_id / 8;
*ages &= ~(1 << (obj_id % 8));
}
else if (gc_old(tag)) {
mark_mode = GC_OLD_MARKED;
@@ -524,12 +520,19 @@ static inline uint16_t gc_setmark_pool_(jl_ptls_t ptls, jl_taggedvalue_t *o,
}
else {
scanned_bytes += page->osize;
if (mark_reset_age) {
page->has_young = 1;
char *page_begin = gc_page_data(o) + GC_PAGE_OFFSET;
int obj_id = (((char*)o) - page_begin) / page->osize;
uint8_t *ages = page->ages + obj_id / 8;
*ages &= ~(1 << (obj_id % 8));
}
}
objprofile_count(jl_typeof(jl_valueof(o)),
mark_mode == GC_OLD_MARKED, page->osize);
page->has_marked = 1;
}
assert(gc_marked(mark_mode));
page->has_marked = 1;
verify_val(jl_valueof(o));
return (tag_changed << 8) | mark_mode;
}
@@ -1786,7 +1789,7 @@ static void jl_gc_mark_ptrfree(jl_ptls_t ptls)
}

// Only one thread should be running in this function
static void _jl_gc_collect(jl_ptls_t ptls, int full)
static int _jl_gc_collect(jl_ptls_t ptls, int full)
{
uint64_t t0 = jl_hrtime();
int64_t last_perm_scanned_bytes = perm_scanned_bytes;
Expand Down Expand Up @@ -1935,9 +1938,7 @@ static void _jl_gc_collect(jl_ptls_t ptls, int full)
gc_num.since_sweep = 0;
gc_num.freed = 0;

if (recollect) {
_jl_gc_collect(ptls, 0);
}
return recollect;
}

JL_DLLEXPORT void jl_gc_collect(int full)
@@ -1965,7 +1966,11 @@ JL_DLLEXPORT void jl_gc_collect(int full)

if (!jl_gc_disable_counter) {
JL_LOCK_NOGC(&finalizers_lock);
_jl_gc_collect(ptls, full);
if (_jl_gc_collect(ptls, full)) {
int ret = _jl_gc_collect(ptls, 0);
(void)ret;
assert(!ret);
}
JL_UNLOCK_NOGC(&finalizers_lock);
}

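The hunk above is the caller-side half of making `_jl_gc_collect` non-recursive: the inner function now returns whether a follow-up quick collection is needed, and `jl_gc_collect` runs that follow-up once, asserting that it does not request yet another pass. A self-contained sketch of the shape (the `collect_once` helper is a stand-in, not the real collector):

    // Sketch of the non-recursive re-collect pattern used above.
    #include <assert.h>

    static int need_recollect;  // set during a collection (e.g. by finalizers)

    static int collect_once(int full)
    {
        (void)full;
        int again = need_recollect;
        need_recollect = 0;      // a follow-up quick pass must not ask again
        return again;
    }

    static void collect(int full)
    {
        if (collect_once(full)) {
            int ret = collect_once(0);
            (void)ret;
            assert(!ret);        // mirrors the assert in jl_gc_collect
        }
    }
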
@@ -2152,11 +2157,11 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
return b;
}

static void *gc_managed_realloc_(void *d, size_t sz, size_t oldsz,
static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t oldsz,
int isaligned, jl_value_t *owner, int8_t can_collect)
{
if (can_collect)
maybe_collect(jl_get_ptls_states());
maybe_collect(ptls);

size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
if (allocsz < sz) // overflow in adding offs, size was "negative"
@@ -2186,7 +2191,8 @@ static void *gc_managed_realloc_(void *d, size_t sz, size_t oldsz,
JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
int isaligned, jl_value_t *owner)
{
return gc_managed_realloc_(d, sz, oldsz, isaligned, owner, 1);
jl_ptls_t ptls = jl_get_ptls_states();
return gc_managed_realloc_(ptls, d, sz, oldsz, isaligned, owner, 1);
}

jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz)
@@ -2217,7 +2223,7 @@ jl_value_t *jl_gc_realloc_string(jl_value_t *s, size_t sz)
// for now it's up to the caller to make sure there are no references to the
// old pointer.
bigval_t *newbig =
(bigval_t*)gc_managed_realloc_(hdr, allocsz, LLT_ALIGN(strsz+offs, JL_CACHE_BYTE_ALIGNMENT),
(bigval_t*)gc_managed_realloc_(ptls, hdr, allocsz, LLT_ALIGN(strsz+offs, JL_CACHE_BYTE_ALIGNMENT),
1, s, 0);
newbig->sz = allocsz;
newbig->age = 0;
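The `gc_setmark_big`/`gc_setmark_pool_` reshuffling above follows a single rule: do the atomic exchange on the object header first, and let only the thread whose exchange actually flipped the mark bit touch shared state (the byte counters, the `big_objects_marked` list, the page age bits). A reduced sketch of that idea, with made-up names rather than the real GC structures:

    // Reduced sketch: only the marker that wins the tag race updates globals,
    // so two threads marking the same object cannot double-count it.
    #include <stdatomic.h>
    #include <stddef.h>
    #include <stdint.h>

    #define MARK_BIT 0x1u

    typedef struct {
        _Atomic uintptr_t header;  // type tag plus GC bits
        size_t sz;
    } obj_t;

    static _Atomic size_t scanned_bytes;  // shared, so only the winner adds to it

    static int try_mark(obj_t *o)
    {
        uintptr_t old = atomic_fetch_or_explicit(&o->header, MARK_BIT,
                                                 memory_order_relaxed);
        if (old & MARK_BIT)
            return 0;  // lost the race: touch nothing global
        atomic_fetch_add_explicit(&scanned_bytes, o->sz, memory_order_relaxed);
        return 1;      // won the race: caller may queue o for scanning
    }
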
40 changes: 19 additions & 21 deletions src/gc.h
@@ -113,27 +113,25 @@ typedef struct _mallocarray_t {

// pool page metadata
typedef struct {
struct {
// index of pool that owns this page
uint16_t pool_n : 8;
// Whether any cell in the page is marked
// This bit is set before sweeping iff there's live cells in the page.
// Note that before marking or after sweeping there can be live
// (and young) cells in the page for `!has_marked`.
uint16_t has_marked: 1;
// Whether any cell was live and young **before sweeping**.
// For a normal sweep (quick sweep that is NOT preceded by a
// full sweep) this bit is set iff there are young or newly dead
// objects in the page and the page needs to be swept.
//
// For a full sweep, this bit should be ignored.
//
// For a quick sweep preceded by a full sweep. If this bit is set,
// the page needs to be swept. If this bit is not set, there could
// still be old dead objects in the page and `nold` and `prev_nold`
// should be used to determine if the page needs to be swept.
uint16_t has_young: 1;
};
// index of pool that owns this page
uint8_t pool_n;
// Whether any cell in the page is marked
// This bit is set before sweeping iff there's live cells in the page.
// Note that before marking or after sweeping there can be live
// (and young) cells in the page for `!has_marked`.
uint8_t has_marked;
// Whether any cell was live and young **before sweeping**.
// For a normal sweep (quick sweep that is NOT preceded by a
// full sweep) this bit is set iff there are young or newly dead
// objects in the page and the page needs to be swept.
//
// For a full sweep, this bit should be ignored.
//
// For a quick sweep preceded by a full sweep. If this bit is set,
// the page needs to be swept. If this bit is not set, there could
// still be old dead objects in the page and `nold` and `prev_nold`
// should be used to determine if the page needs to be swept.
uint8_t has_young;
// number of old objects in this page
uint16_t nold;
// number of old objects in this page during the previous full sweep
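The switch from bitfields to whole bytes matters because adjacent bitfields share one memory location in the C11 memory model: two threads setting `has_marked` and `has_young` on the same page would each perform a read-modify-write of the same word, which is a data race. Separate `uint8_t` fields are distinct memory locations and can be stored to concurrently. A toy illustration (this struct is not the real page metadata):

    // Toy example: concurrent stores to two different byte fields are fine;
    // with adjacent bitfields they would race on one word.
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        uint8_t has_marked;  // its own memory location
        uint8_t has_young;   // its own memory location
    } pagemeta_t;

    static pagemeta_t page;

    static void *set_marked(void *arg) { (void)arg; page.has_marked = 1; return NULL; }
    static void *set_young(void *arg)  { (void)arg; page.has_young  = 1; return NULL; }

    int main(void)
    {
        pthread_t a, b;
        pthread_create(&a, NULL, set_marked, NULL);
        pthread_create(&b, NULL, set_young, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        // With `uint16_t has_marked:1; uint16_t has_young:1;` these two writes
        // would be read-modify-writes of the same word, i.e. a data race.
        printf("%d %d\n", page.has_marked, page.has_young);
        return 0;
    }
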
29 changes: 16 additions & 13 deletions src/threadgroup.c
@@ -41,7 +41,7 @@ int ti_threadgroup_create(uint8_t num_sockets, uint8_t num_cores,
jl_malloc_aligned(num_threads * sizeof(ti_thread_sense_t*), 64);
for (i = 0; i < num_threads; i++)
tg->thread_sense[i] = NULL;
tg->group_sense = 0;
jl_atomic_store_release(&tg->group_sense, 0);

uv_mutex_init(&tg->alarm_lock);
uv_cond_init(&tg->alarm);
@@ -118,10 +118,13 @@ int ti_threadgroup_size(ti_threadgroup_t *tg, int16_t *tgsize)

int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
{
if (tg->tid_map[ext_tid] == 0) {
uint8_t *group_sense = &tg->group_sense;
int16_t tid = tg->tid_map[ext_tid];
int thread_sense = tg->thread_sense[tid]->sense;
if (tid == 0) {
tg->envelope = bcast_val ? *bcast_val : NULL;
// synchronize `tg->envelope` and `tg->group_sense`
jl_atomic_store_release(&tg->group_sense, tg->thread_sense[0]->sense);
jl_atomic_store_release(group_sense, thread_sense);

// if it's possible that threads are sleeping, signal them
if (tg->sleep_threshold) {
@@ -135,8 +138,7 @@ int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
uint64_t spin_ns;
uint64_t spin_start = 0;
// synchronize `tg->envelope` and `tg->group_sense`
while (jl_atomic_load_acquire(&tg->group_sense) !=
tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
while (jl_atomic_load_acquire(group_sense) != thread_sense) {
if (tg->sleep_threshold) {
if (!spin_start) {
// Lazily initialize spin_start since uv_hrtime is expensive
@@ -147,8 +149,7 @@ int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)
// In case uv_hrtime is not monotonic, we'll sleep earlier
if (spin_ns >= tg->sleep_threshold) {
uv_mutex_lock(&tg->alarm_lock);
if (tg->group_sense !=
tg->thread_sense[tg->tid_map[ext_tid]]->sense) {
if (jl_atomic_load_acquire(group_sense) != thread_sense) {
uv_cond_wait(&tg->alarm, &tg->alarm_lock);
}
uv_mutex_unlock(&tg->alarm_lock);
@@ -167,14 +168,16 @@ int ti_threadgroup_fork(ti_threadgroup_t *tg, int16_t ext_tid, void **bcast_val)

int ti_threadgroup_join(ti_threadgroup_t *tg, int16_t ext_tid)
{
int i;

tg->thread_sense[tg->tid_map[ext_tid]]->sense
= !tg->thread_sense[tg->tid_map[ext_tid]]->sense;
int *p_thread_sense = &tg->thread_sense[tg->tid_map[ext_tid]]->sense;
jl_atomic_store_release(p_thread_sense, !*p_thread_sense);
if (tg->tid_map[ext_tid] == 0) {
for (i = 1; i < tg->num_threads; ++i) {
while (tg->thread_sense[i]->sense == tg->group_sense)
jl_ptls_t ptls = jl_get_ptls_states();
int8_t group_sense = tg->group_sense;
for (int i = 1; i < tg->num_threads; ++i) {
while (jl_atomic_load_acquire(&tg->thread_sense[i]->sense) == group_sense) {
jl_gc_safepoint_(ptls);
jl_cpu_pause();
}
}
}

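In the new `ti_threadgroup_join` above, the master stays in a GC-unsafe state and calls `jl_gc_safepoint_` on every spin iteration, so a collection requested by another thread can run while it waits, without the enter-safe/leave-safe transitions that `jl_threading_run` used to perform around the join. A simplified sketch of that spin-with-safepoint shape, with a plain atomic flag standing in for Julia's page-protection safepoint:

    // Simplified shape of the join loop: spin on each worker's sense flag and
    // poll a safepoint each iteration.  The flag and helpers are stand-ins.
    #include <stdatomic.h>
    #include <stdbool.h>

    static _Atomic bool gc_requested;        // set by whichever thread wants a GC
    static void run_pending_gc(void) { /* placeholder: branch into the collector */ }

    static inline void gc_safepoint(void)
    {
        if (atomic_load_explicit(&gc_requested, memory_order_acquire))
            run_pending_gc();
    }

    static void join_wait(_Atomic int *worker_sense, int nworkers, int group_sense)
    {
        for (int i = 1; i < nworkers; i++) {
            // Wait for worker i to flip its sense flag away from group_sense.
            while (atomic_load_explicit(&worker_sense[i], memory_order_acquire)
                   == group_sense) {
                gc_safepoint();  // lets a requested collection run while spinning
                // a pause/cpu_relax hint would go here, as jl_cpu_pause() does
            }
        }
    }
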
4 changes: 2 additions & 2 deletions src/threadgroup.h
@@ -8,7 +8,7 @@

// for the barrier
typedef struct {
volatile int sense;
int sense;
} ti_thread_sense_t;

// thread group
@@ -17,7 +17,7 @@ typedef struct {
uint8_t num_sockets, num_cores, num_threads_per_core;

// fork/join/barrier
volatile uint8_t group_sense;
uint8_t group_sense; // Written only by the master thread
ti_thread_sense_t **thread_sense;
void *envelope;

2 changes: 0 additions & 2 deletions src/threading.c
@@ -693,10 +693,8 @@ JL_DLLEXPORT jl_value_t *jl_threading_run(jl_svec_t *args)
user_ns[ptls->tid] += (trun - tfork);
#endif

jl_gc_state_set(ptls, JL_GC_STATE_SAFE, 0);
// wait for completion (TODO: nowait?)
ti_threadgroup_join(tgworld, ptls->tid);
jl_gc_state_set(ptls, 0, JL_GC_STATE_SAFE);

#if PROFILE_JL_THREADING
uint64_t tjoin = uv_hrtime();
