Skip to content

Commit

Permalink
Revert "[Distributed] make finalizer messages threadsafe (JuliaLang#42240)"
Browse files Browse the repository at this point in the history

This reverts commit eb1d6b3.
  • Loading branch information
tkf committed Sep 26, 2021
1 parent 37b7a33 commit bdc1e6c
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 75 deletions.
11 changes: 3 additions & 8 deletions stdlib/Distributed/src/cluster.jl
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,9 @@ end
@enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED
mutable struct Worker
id::Int
msg_lock::Threads.ReentrantLock # Lock for del_msgs, add_msgs, and gcflag
del_msgs::Array{Any,1} # XXX: Could del_msgs and add_msgs be Channels?
del_msgs::Array{Any,1}
add_msgs::Array{Any,1}
@atomic gcflag::Bool
gcflag::Bool
state::WorkerState
c_state::Condition # wait for state changes
ct_time::Float64 # creation time
Expand Down Expand Up @@ -134,7 +133,7 @@ mutable struct Worker
if haskey(map_pid_wrkr, id)
return map_pid_wrkr[id]
end
w=new(id, Threads.ReentrantLock(), [], [], false, W_CREATED, Condition(), time(), conn_func)
w=new(id, [], [], false, W_CREATED, Condition(), time(), conn_func)
w.initialized = Event()
register_worker(w)
w
Expand Down Expand Up @@ -472,10 +471,6 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
# The `launch` method should add an object of type WorkerConfig for every
# worker launched. It provides information required on how to connect
# to it.

# FIXME: launched should be a Channel, launch_ntfy should be a Threads.Condition
# but both are part of the public interface. This means we currently can't use
# `Threads.@spawn` in the code below.
launched = WorkerConfig[]
launch_ntfy = Condition()

Expand Down
39 changes: 16 additions & 23 deletions stdlib/Distributed/src/messages.jl
Original file line number Diff line number Diff line change
Expand Up @@ -126,30 +126,23 @@ function flush_gc_msgs(w::Worker)
if !isdefined(w, :w_stream)
return
end
add_msgs = nothing
del_msgs = nothing
@lock w.msg_lock begin
if !w.gcflag # No work needed for this worker
return
end
@atomic w.gcflag = false
if !isempty(w.add_msgs)
add_msgs = w.add_msgs
w.add_msgs = Any[]
end

if !isempty(w.del_msgs)
del_msgs = w.del_msgs
w.del_msgs = Any[]
end
end
if add_msgs !== nothing
remote_do(add_clients, w, add_msgs)
w.gcflag = false
new_array = Any[]
msgs = w.add_msgs
w.add_msgs = new_array
if !isempty(msgs)
remote_do(add_clients, w, msgs)
end
if del_msgs !== nothing
remote_do(del_clients, w, del_msgs)

# del_msgs gets populated by finalizers, so be very careful here about ordering of allocations
# XXX: threading requires this to be atomic
new_array = Any[]
msgs = w.del_msgs
w.del_msgs = new_array
if !isempty(msgs)
#print("sending delete of $msgs\n")
remote_do(del_clients, w, msgs)
end
return
end

# Boundary inserted between messages on the wire, used for recovering
Expand Down Expand Up @@ -194,7 +187,7 @@ end
function flush_gc_msgs()
try
for w in (PGRP::ProcessGroup).workers
if isa(w,Worker) && (w.state == W_CONNECTED) && w.gcflag
if isa(w,Worker) && w.gcflag && (w.state == W_CONNECTED)
flush_gc_msgs(w)
end
end
Expand Down
55 changes: 11 additions & 44 deletions stdlib/Distributed/src/remotecall.jl
Original file line number Diff line number Diff line change
Expand Up @@ -256,27 +256,14 @@ function del_clients(pairs::Vector)
end
end

# The task below is coalescing the `flush_gc_msgs` call
# across multiple producers, see `send_del_client`,
# and `send_add_client`.
# XXX: Is this worth the additional complexity?
# `flush_gc_msgs` has to iterate over all connected workers.
const any_gc_flag = Threads.Condition()
const any_gc_flag = Condition()
function start_gc_msgs_task()
errormonitor(
Threads.@spawn begin
while true
lock(any_gc_flag) do
# this might miss events
wait(any_gc_flag)
end
flush_gc_msgs() # handles throws internally
end
end
)
errormonitor(@async while true
wait(any_gc_flag)
flush_gc_msgs()
end)
end

# Function can be called within a finalizer
function send_del_client(rr)
if rr.where == myid()
del_client(rr)
Expand All @@ -294,27 +281,11 @@ function send_del_client_no_lock(rr)
end
end

# Enqueue a delete-client message for worker `w` and wake the coalescing
# GC-message task. Thread-safe: the per-worker `msg_lock` guards the
# `del_msgs` queue and the `gcflag`, and `any_gc_flag` (a Threads.Condition)
# must be held while notifying.
function publish_del_msg!(w::Worker, msg)
    lock(w.msg_lock)
    try
        push!(w.del_msgs, msg)
        @atomic w.gcflag = true
    finally
        unlock(w.msg_lock)
    end
    lock(any_gc_flag)
    try
        notify(any_gc_flag)
    finally
        unlock(any_gc_flag)
    end
end

function process_worker(rr)
w = worker_from_id(rr.where)::Worker
msg = (remoteref_id(rr), myid())

# Needs to aquire a lock on the del_msg queue
T = Threads.@spawn begin
publish_del_msg!($w, $msg)
end
Base.errormonitor(T)

return
push!(w.del_msgs, (remoteref_id(rr), myid()))
w.gcflag = true
notify(any_gc_flag)
end

function add_client(id, client)
Expand All @@ -339,13 +310,9 @@ function send_add_client(rr::AbstractRemoteRef, i)
# to the processor that owns the remote ref. it will add_client
# itself inside deserialize().
w = worker_from_id(rr.where)
lock(w.msg_lock) do
push!(w.add_msgs, (remoteref_id(rr), i))
@atomic w.gcflag = true
end
lock(any_gc_flag) do
notify(any_gc_flag)
end
push!(w.add_msgs, (remoteref_id(rr), i))
w.gcflag = true
notify(any_gc_flag)
end
end

Expand Down

0 comments on commit bdc1e6c

Please sign in to comment.