Cluster singleton failure due to down-removed, see #3130
* The scenario was that the previous leader left. * The problem was that the new leader got MemberRemoved before it got HandOverDone and therefore missed the hand-over data. * Solved by not changing the singleton to leader when receiving MemberRemoved, and instead doing that on a normal HandOverDone or, in failure cases, after the retry timeout. * The reason for this bug was the new transition from Down to Removed and the fact that there is now no MemberDowned event. Previously this was only triggered by MemberDowned (not MemberRemoved), and that was safe because it was "always" preceded by unreachable. * The new solution means that it will take longer for the new singleton to start up when the previous leader is unreachable, but I don't want to trigger it on MemberUnreachable because in the future it might be possible to switch the member back to reachable.
This commit is contained in:
parent
01bfb9378e
commit
d98a7ef1e8
3 changed files with 19 additions and 14 deletions
|
|
@ -314,7 +314,7 @@ class ClusterSingletonManagerSpec extends MultiNodeSpec(ClusterSingletonManagerS
|
|||
enterBarrier("after-leave")
|
||||
}
|
||||
|
||||
"take over when leader crashes in 5 nodes cluster" in within(35 seconds) {
|
||||
"take over when leader crashes in 5 nodes cluster" in within(60 seconds) {
|
||||
system.eventStream.publish(Mute(EventFilter.warning(pattern = ".*received dead letter from.*")))
|
||||
system.eventStream.publish(Mute(EventFilter.error(pattern = ".*Disassociated.*")))
|
||||
system.eventStream.publish(Mute(EventFilter.error(pattern = ".*Association failed.*")))
|
||||
|
|
@ -324,12 +324,12 @@ class ClusterSingletonManagerSpec extends MultiNodeSpec(ClusterSingletonManagerS
|
|||
verify(sortedClusterRoles(2), msg = 8, expectedCurrent = 0)
|
||||
}
|
||||
|
||||
"take over when two leaders crash in 3 nodes cluster" in within(45 seconds) {
|
||||
"take over when two leaders crash in 3 nodes cluster" in within(60 seconds) {
|
||||
crash(sortedClusterRoles(2), sortedClusterRoles(3))
|
||||
verify(sortedClusterRoles(4), msg = 9, expectedCurrent = 0)
|
||||
}
|
||||
|
||||
"take over when leader crashes in 2 nodes cluster" in within(25 seconds) {
|
||||
"take over when leader crashes in 2 nodes cluster" in within(60 seconds) {
|
||||
crash(sortedClusterRoles(4))
|
||||
verify(sortedClusterRoles(5), msg = 10, expectedCurrent = 0)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue