[Distributed] kill(::LocalManager, ...) should actually call kill()

When dealing with a local process, if we want to remove a process, we can try a little harder than simply calling `remote_do(exit, id)`. We can actually `kill()` the process by sending `SIGTERM`, then `SIGKILL`. Because we use `Distributed` to run our Base test workers, this can provide a more certain method of closing our workers at the end of test sets, as well as a better way of killing processes such that they dump core in the event that they do get stuck.
pcjentsch · Jun 24, 2022 · fd1c21b · fd1c21b
1 parent a375fe8
commit fd1c21b
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 0 deletions.
diff --git a/stdlib/Distributed/src/managers.jl b/stdlib/Distributed/src/managers.jl
@@ -725,3 +725,26 @@ function kill(manager::SSHManager, pid::Int, config::WorkerConfig)
  cancel_ssh_tunnel(config)
  nothing
 end
+
+function kill(manager::LocalManager, pid::Int, config::WorkerConfig; exit_timeout = 15, term_timeout = 15)
+ # First, try sending `exit()` to the remote over the usual control channels
+ remote_do(exit, pid)
+
+ timer_task = @async begin
+ sleep(exit_timeout)
+
+ # Check to see if our child exited, and if not, send an actual kill signal
+ if !process_exited(config.process)
+ @warn("Failed to gracefully kill worker $(pid), sending SIGTERM")
+ kill(config.process, Base.SIGTERM)
+
+ sleep(term_timeout)
+ if !process_exited(config.process)
+ @warn("Worker $(pid) ignored SIGTERM, sending SIGKILL")
+ kill(config.process, Base.SIGKILL)
+ end
+ end
+ end
+ errormonitor(timer_task)
+ return nothing
+end
diff --git a/stdlib/Distributed/test/distributed_exec.jl b/stdlib/Distributed/test/distributed_exec.jl
@@ -1856,6 +1856,26 @@ end end
 
 include("splitrange.jl")
 
+# Clear all workers for timeout tests (issue #45785)
+rmprocs(workers())
+begin
+ # First, assert that we get no messages when we close a cooperative worker
+ w = only(addprocs(1))
+ @test_nowarn begin
+ wait(rmprocs([w]))
+ end
+
+ # Next, ensure we get a log message when a worker does not cleanly exit
+ w = only(addprocs(1))
+ @test_logs (:warn, r"sending SIGTERM") begin
+ remote_do(w) do
+ # Cause the 'exit()' message that `rmprocs()` sends to do nothing
+ Core.eval(Base, :(exit() = nothing))
+ end
+ wait(rmprocs([w]))
+ end
+end
+
 # Run topology tests last after removing all workers, since a given
 # cluster at any time only supports a single topology.
 rmprocs(workers())