Skip to content

Commit

Permalink
Tracker: always look at topology on process conflict
Browse files Browse the repository at this point in the history
When there are two conflicting processes, ignore the clocks and always
look at the current node topology to resolve the conflict.

Fixes the integration test, which was failing intermittently.
  • Loading branch information
Arjan Scherpenisse committed Jul 23, 2018
1 parent eb89e5c commit e793c78
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 59 deletions.
4 changes: 2 additions & 2 deletions config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ config :swarm,
nodes: [:"[email protected]", :"[email protected]"],
sync_nodes_timeout: 0,
anti_entropy_interval: 5_000,
debug: false,
debug: true,
node_blacklist: [
# the following blacklists nodes set up by exrm/relx/distillery
# for remote shells (the first) and hot upgrade scripting (the second)
Expand All @@ -20,7 +20,7 @@ config :swarm,
]

config :logger,
level: :warn
level: :debug

config :porcelain,
goon_warn_if_missing: false
99 changes: 44 additions & 55 deletions lib/swarm/tracker/tracker.ex
Original file line number Diff line number Diff line change
Expand Up @@ -480,83 +480,72 @@ defmodule Swarm.Tracker do

state

entry(pid: lpid, clock: lclock) = lreg ->
# there are two different processes for the same name, we need to resolve
case Clock.compare(lclock, rclock) do
:lt ->
# the remote registration dominates
entry(pid: lpid, clock: _lclock) = lreg ->
# there are two different processes for the same name, so we
# need to determine which process is correct based on the
# current topology and resolve the conflict

rpid_node = node(rpid)
lpid_node = node(lpid)

case Strategy.key_to_node(state.strategy, rname) do
^rpid_node when lpid_node != rpid_node ->
debug(
"remote and local view of #{inspect(rname)} conflict, but remote is correct, resolving.."
)

resolve_incorrect_local_reg(sync_node, lreg, rreg, state)

:gt ->
# local registration dominates
debug("remote view of #{inspect(rname)} is outdated, resolving..")
^lpid_node when lpid_node != rpid_node ->
debug(
"remote and local view of #{inspect(rname)} conflict, but local is correct, resolving.."
)

resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

_ ->
# the entry clocks conflict, determine which one is correct based on
# current topology and resolve the conflict
rpid_node = node(rpid)
lpid_node = node(lpid)

case Strategy.key_to_node(state.strategy, rname) do
^rpid_node when lpid_node != rpid_node ->
cond do
lpid_node == rpid_node and lpid > rpid ->
debug(
"remote and local view of #{inspect(rname)} conflict, but remote is correct, resolving.."
"remote and local view of #{inspect(rname)} conflict, but local is more recent, resolving.."
)

resolve_incorrect_local_reg(sync_node, lreg, rreg, state)
resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

^lpid_node when lpid_node != rpid_node ->
lpid_node == rpid_node and lpid < rpid ->
debug(
"remote and local view of #{inspect(rname)} conflict, but local is correct, resolving.."
"remote and local view of #{inspect(rname)} conflict, but remote is more recent, resolving.."
)

resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)
resolve_incorrect_local_reg(sync_node, lreg, rreg, state)

_ ->
cond do
lpid_node == rpid_node and lpid > rpid ->
:else ->
# name should be on another node, so neither registration is correct, break tie
# using registry clock instead
case Clock.compare(clock, sync_clock) do
:lt ->
# remote dominates
resolve_incorrect_local_reg(sync_node, lreg, rreg, state)

:gt ->
# local dominates
debug(
"remote and local view of #{inspect(rname)} conflict, but local is more recent, resolving.."
"remote view of #{inspect(rname)} is outdated based on registry clock, resolving.."
)

resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

lpid_node == rpid_node and lpid < rpid ->
_ when lpid_node > rpid_node ->
# break tie using node priority
debug(
"remote and local view of #{inspect(rname)} conflict, but remote is more recent, resolving.."
"remote view of #{inspect(rname)} is outdated based on node priority, resolving.."
)

resolve_incorrect_local_reg(sync_node, lreg, rreg, state)
resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

:else ->
# name should be on another node, so neither registration is correct, break tie
# using registry clock instead
case Clock.compare(clock, sync_clock) do
:lt ->
# remote dominates
resolve_incorrect_local_reg(sync_node, lreg, rreg, state)

:gt ->
# local dominates
debug(
"remote view of #{inspect(rname)} is outdated based on registry clock, resolving.."
)

resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

_ when lpid_node > rpid_node ->
# break tie using node priority
debug(
"remote view of #{inspect(rname)} is outdated based on node priority, resolving.."
)

resolve_incorrect_remote_reg(sync_node, lreg, rreg, state)

_ ->
# break tie using node priority
resolve_incorrect_local_reg(sync_node, lreg, rreg, state)
end
_ ->
# break tie using node priority
resolve_incorrect_local_reg(sync_node, lreg, rreg, state)
end
end
end
Expand Down
7 changes: 5 additions & 2 deletions test/integration_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ defmodule Swarm.IntegrationTest do
{_, {:ok, _}} = spawn_worker(@node1, {:worker, n}, group_name)
end

# wait for process registration
Process.sleep(1000)

node1_registry = get_registry(@node1)
node2_registry = get_registry(@node2)

Expand All @@ -42,7 +45,7 @@ defmodule Swarm.IntegrationTest do
disconnect(@node1, from: @node2)

# wait for process redistribution
Process.sleep(@worker_count)
Process.sleep(1000)

## check to see if the processes were migrated as expected
assert length(workers_for(@node1)) == @worker_count
Expand All @@ -55,7 +58,7 @@ defmodule Swarm.IntegrationTest do
connect(@node1, to: @node2)

# give time to sync
Process.sleep(1_000)
Process.sleep(1000)

# make sure processes are back in the correct place
assert length(workers_for(@node1)) < @worker_count
Expand Down

0 comments on commit e793c78

Please sign in to comment.