Fix shutdown/remove race as described by @rkuhn, see #2137

* Skip nodes removal
* Ignore removed client when enter barrier
* Change order of testConductor.shutdown and testConductor.removeNode
This commit is contained in:
Patrik Nordwall 2012-06-04 11:38:39 +02:00
parent e7cf92e72a
commit 52f122107c
6 changed files with 8 additions and 9 deletions

View file

@ -43,8 +43,8 @@ class ClientDowningNodeThatIsUnreachableSpec
testConductor.enter("all-up")
// kill 'third' node
testConductor.shutdown(third, 0)
testConductor.removeNode(third)
testConductor.shutdown(third, 0)
// mark 'third' node as DOWN
cluster.down(thirdAddress)

View file

@ -57,8 +57,8 @@ abstract class GossipingAccrualFailureDetectorSpec extends MultiNodeSpec(Gossipi
"mark node as 'unavailable' if a node in the cluster is shut down (and its heartbeats stops)" taggedAs LongRunningTest in {
runOn(first) {
testConductor.shutdown(third, 0)
testConductor.removeNode(third)
testConductor.shutdown(third, 0)
}
runOn(first, second) {

View file

@ -51,8 +51,8 @@ class LeaderDowningNodeThatIsUnreachableSpec
testConductor.enter("all-up")
// kill 'fourth' node
testConductor.shutdown(fourth, 0)
testConductor.removeNode(fourth)
testConductor.shutdown(fourth, 0)
testConductor.enter("down-fourth-node")
// --- HERE THE LEADER SHOULD DETECT FAILURE AND AUTO-DOWN THE UNREACHABLE NODE ---
@ -91,8 +91,8 @@ class LeaderDowningNodeThatIsUnreachableSpec
testConductor.enter("all-up")
// kill 'second' node
testConductor.shutdown(second, 0)
testConductor.removeNode(second)
testConductor.shutdown(second, 0)
testConductor.enter("down-second-node")
// --- HERE THE LEADER SHOULD DETECT FAILURE AND AUTO-DOWN THE UNREACHABLE NODE ---

View file

@ -65,8 +65,8 @@ abstract class LeaderElectionSpec extends MultiNodeSpec(LeaderElectionMultiJvmSp
case `controller`
testConductor.enter("before-shutdown")
testConductor.shutdown(leader, 0)
testConductor.removeNode(leader)
testConductor.shutdown(leader, 0)
testConductor.enter("after-shutdown", "after-down", "completed")
case `leader`

View file

@ -57,8 +57,8 @@ abstract class NodeShutdownSpec extends MultiNodeSpec(NodeShutdownMultiJvmSpec)
"become singleton cluster when one node is shutdown" taggedAs LongRunningTest in {
runOn(first) {
val secondAddress = node(second).address
testConductor.shutdown(second, 0)
testConductor.removeNode(second)
testConductor.shutdown(second, 0)
awaitUpConvergence(numberOfMembers = 1, canNotBePartOfMemberRing = Seq(secondAddress), 30.seconds)
cluster.isSingletonCluster must be(true)
assertLeader(first)

View file

@ -444,7 +444,6 @@ private[akka] class Controller(private var initialParticipants: Int, controllerP
nodes(node).fsm forward ToClient(TerminateMsg(exitValueOrKill))
}
case Remove(node)
nodes -= node
barrier ! BarrierCoordinator.RemoveClient(node)
}
case GetNodes sender ! nodes.keys
@ -540,8 +539,8 @@ private[akka] class BarrierCoordinator extends Actor with LoggingFSM[BarrierCoor
when(Waiting) {
case Event(EnterBarrier(name), d @ Data(clients, barrier, arrived))
if (name != barrier || clients.find(_.fsm == sender).isEmpty) throw WrongBarrier(name, sender, d)
val together = sender :: arrived
if (name != barrier) throw WrongBarrier(name, sender, d)
val together = if (clients.find(_.fsm == sender).isDefined) sender :: arrived else arrived
handleBarrier(d.copy(arrived = together))
case Event(RemoveClient(name), d @ Data(clients, barrier, arrived))
clients find (_.name == name) match {