Cluster singleton failure due to down-removed, see #3130
* The scenario: the previous leader left the cluster.
* The problem: the new leader got MemberRemoved before it got HandOverDone and therefore missed the hand-over data.
* Solved by not switching the singleton to leader when receiving MemberRemoved, and instead doing that on a normal HandOverDone or, in failure cases, after the retry timeout.
* The root cause was the new transition from Down to Removed and the fact that there is no longer a MemberDowned event. Previously this was only triggered by MemberDowned (not MemberRemoved), which was safe because that event was "always" preceded by unreachable.
* The new solution means it will take longer for the new singleton to start up when the previous leader is unreachable, but I don't want to trigger it on MemberUnreachable, because it might in the future be possible to switch a node back to reachable.
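To make the new hand-over rule concrete, below is a minimal, self-contained Scala sketch of the decision logic described above. It is a hypothetical model for illustration only, not code from ClusterSingletonManager: the names decide, TakeOver, Stuck and maxRetries are invented here, while MemberRemoved, HandOverDone, HandOverRetry and the removed-set check mirror the actual change.

object BecomingLeaderSketch {
  sealed trait Event
  final case class MemberRemoved(address: String) extends Event
  final case class HandOverDone(handOverData: Option[Any]) extends Event
  final case class HandOverRetry(count: Int) extends Event

  sealed trait Action
  final case class TakeOver(handOverData: Option[Any]) extends Action // corresponds to gotoLeader(...)
  case object Stay extends Action
  case object RetryHandOver extends Action
  case object Stuck extends Action // corresponds to ClusterSingletonManagerIsStuck

  val maxRetries = 10 // hypothetical retry limit

  def decide(event: Event, previousLeader: Option[String], removed: Set[String]): Action =
    event match {
      // Previous leader was removed from the cluster: remember it, but do NOT
      // become leader yet -- the hand-over data may still arrive.
      case MemberRemoved(a) if previousLeader.contains(a) => Stay
      // Normal case: the previous leader handed over, start the singleton now.
      case HandOverDone(data) => TakeOver(data)
      // Failure case: retries exhausted and the previous leader is unknown or
      // already removed, so no hand-over will ever come; start without data.
      case HandOverRetry(count) if count > maxRetries =>
        if (previousLeader.forall(removed.contains)) TakeOver(None) else Stuck
      // Otherwise keep asking the previous leader to hand over.
      case HandOverRetry(_) => RetryHandOver
      case _ => Stay
    }
}

For example, decide(MemberRemoved("a"), Some("a"), Set.empty) now yields Stay, whereas the old behaviour effectively took over immediately on MemberRemoved and thereby dropped the hand-over data.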
parent 01bfb9378e
commit d98a7ef1e8
3 changed files with 19 additions and 14 deletions
@@ -428,7 +428,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
       latestGossip = seenVersionedGossip
 
-      log.info("Cluster Node [{}] - Marked address [{}] as LEAVING", selfAddress, address)
+      log.info("Cluster Node [{}] - Marked address [{}] as [{}]", selfAddress, address, Leaving)
 
       publish(latestGossip)
     }
   }
@@ -437,7 +437,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
   * State transition to EXITING.
   */
  def exiting(address: Address): Unit = {
-    log.info("Cluster Node [{}] - Marked node [{}] as EXITING", selfAddress, address)
+    log.info("Cluster Node [{}] - Marked node [{}] as [{}]", selfAddress, address, Exiting)
    // FIXME implement when we implement hand-off
  }
 
@@ -475,7 +475,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
 
    val newMembers = downedMember match {
      case Some(m) ⇒
-        log.info("Cluster Node [{}] - Marking node [{}] as DOWN", selfAddress, m.address)
+        log.info("Cluster Node [{}] - Marking node [{}] as [{}]", selfAddress, m.address, Down)
        localMembers - m
      case None ⇒ localMembers
    }
@@ -485,7 +485,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
    localUnreachableMembers.map { member ⇒
      // no need to DOWN members already DOWN
      if (member.address == address && member.status != Down) {
-        log.info("Cluster Node [{}] - Marking unreachable node [{}] as DOWN", selfAddress, member.address)
+        log.info("Cluster Node [{}] - Marking unreachable node [{}] as [{}]", selfAddress, member.address, Down)
        member copy (status = Down)
      } else member
    }
@@ -786,25 +786,30 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Acto
      // ----------------------
 
      // log the move of members from joining to up
-      upMembers foreach { member ⇒ log.info("Cluster Node [{}] - Leader is moving node [{}] from JOINING to UP", selfAddress, member.address) }
+      upMembers foreach { member ⇒
+        log.info("Cluster Node [{}] - Leader is moving node [{}] from [{}] to [{}]",
+          selfAddress, member.address, member.status, Up)
+      }
 
      // tell all removed members to remove and shut down themselves
      removedMembers foreach { member ⇒
        val address = member.address
-        log.info("Cluster Node [{}] - Leader is moving node [{}] from EXITING to REMOVED - and removing node from node ring", selfAddress, address)
+        log.info("Cluster Node [{}] - Leader is moving node [{}] from [{}] to [{}] - and removing node from node ring",
+          selfAddress, address, member.status, Removed)
        clusterCore(address) ! ClusterLeaderAction.Remove(address)
      }
 
      // tell all exiting members to exit
      exitingMembers foreach { member ⇒
        val address = member.address
-        log.info("Cluster Node [{}] - Leader is moving node [{}] from LEAVING to EXITING", selfAddress, address)
+        log.info("Cluster Node [{}] - Leader is moving node [{}] from [{}] to [{}]",
+          selfAddress, address, member.status, Exiting)
        clusterCore(address) ! ClusterLeaderAction.Exit(address) // FIXME should use ? to await completion of handoff?
      }
 
      // log the auto-downing of the unreachable nodes
      unreachableButNotDownedMembers foreach { member ⇒
-        log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as DOWN", selfAddress, member.address)
+        log.info("Cluster Node [{}] - Leader is marking unreachable node [{}] as [{}]", selfAddress, member.address, Down)
      }
 
      publish(latestGossip)
@@ -422,7 +422,7 @@ class ClusterSingletonManager(
    case Event(MemberRemoved(m), BecomingLeaderData(Some(previousLeader))) if m.address == previousLeader ⇒
      logInfo("Previous leader [{}] removed", previousLeader)
      addRemoved(m.address)
-      gotoLeader(None)
+      stay
 
    case Event(TakeOverFromMe, BecomingLeaderData(None)) ⇒
      sender ! HandOverToMe
@@ -439,10 +439,10 @@ class ClusterSingletonManager(
        logInfo("Retry [{}], sending HandOverToMe to [{}]", count, previousLeaderOption)
        previousLeaderOption foreach { peer(_) ! HandOverToMe }
        setTimer(HandOverRetryTimer, HandOverRetry(count + 1), retryInterval, repeat = false)
-      } else if (previousLeaderOption.isEmpty) {
+      } else if (previousLeaderOption forall removed.contains) {
        // can't send HandOverToMe, previousLeader unknown for new node (or restart)
        // previous leader might be down or removed, so no TakeOverFromMe message is received
-        logInfo("Timeout in BecomingLeader. Previous leader unknown and no TakeOver request.")
+        logInfo("Timeout in BecomingLeader. Previous leader unknown, removed and no TakeOver request.")
        gotoLeader(None)
      } else
        throw new ClusterSingletonManagerIsStuck(
@@ -314,7 +314,7 @@ class ClusterSingletonManagerSpec extends MultiNodeSpec(ClusterSingletonManagerS
      enterBarrier("after-leave")
    }
 
-    "take over when leader crashes in 5 nodes cluster" in within(35 seconds) {
+    "take over when leader crashes in 5 nodes cluster" in within(60 seconds) {
      system.eventStream.publish(Mute(EventFilter.warning(pattern = ".*received dead letter from.*")))
      system.eventStream.publish(Mute(EventFilter.error(pattern = ".*Disassociated.*")))
      system.eventStream.publish(Mute(EventFilter.error(pattern = ".*Association failed.*")))
@@ -324,12 +324,12 @@ class ClusterSingletonManagerSpec extends MultiNodeSpec(ClusterSingletonManagerS
      verify(sortedClusterRoles(2), msg = 8, expectedCurrent = 0)
    }
 
-    "take over when two leaders crash in 3 nodes cluster" in within(45 seconds) {
+    "take over when two leaders crash in 3 nodes cluster" in within(60 seconds) {
      crash(sortedClusterRoles(2), sortedClusterRoles(3))
      verify(sortedClusterRoles(4), msg = 9, expectedCurrent = 0)
    }
 
-    "take over when leader crashes in 2 nodes cluster" in within(25 seconds) {
+    "take over when leader crashes in 2 nodes cluster" in within(60 seconds) {
      crash(sortedClusterRoles(4))
      verify(sortedClusterRoles(5), msg = 10, expectedCurrent = 0)
    }