Fix shutdown/remove race as described by @rkuhn, see #2137

* Skip node removal in the Controller * Ignore a removed client when it enters a barrier * Call testConductor.removeNode before testConductor.shutdown
2012-06-04 11:38:39 +02:00 · 2012-06-04 11:38:39 +02:00 · 52f122107c
commit 52f122107c
parent e7cf92e72a
6 changed files with 8 additions and 9 deletions
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/ClientDowningNodeThatIsUnreachableSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/ClientDowningNodeThatIsUnreachableSpec.scala
@@ -43,8 +43,8 @@ class ClientDowningNodeThatIsUnreachableSpec
        testConductor.enter("all-up")

        // kill 'third' node
-        testConductor.shutdown(third, 0)
        testConductor.removeNode(third)
+        testConductor.shutdown(third, 0)

        // mark 'third' node as DOWN
        cluster.down(thirdAddress)
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/GossipingAccrualFailureDetectorSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/GossipingAccrualFailureDetectorSpec.scala
@@ -57,8 +57,8 @@ abstract class GossipingAccrualFailureDetectorSpec extends MultiNodeSpec(Gossipi

    "mark node as 'unavailable' if a node in the cluster is shut down (and its heartbeats stops)" taggedAs LongRunningTest in {
      runOn(first) {
-        testConductor.shutdown(third, 0)
        testConductor.removeNode(third)
+        testConductor.shutdown(third, 0)
      }

      runOn(first, second) {
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/LeaderDowningNodeThatIsUnreachableSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/LeaderDowningNodeThatIsUnreachableSpec.scala
@@ -51,8 +51,8 @@ class LeaderDowningNodeThatIsUnreachableSpec
        testConductor.enter("all-up")

        // kill 'fourth' node
-        testConductor.shutdown(fourth, 0)
        testConductor.removeNode(fourth)
+        testConductor.shutdown(fourth, 0)
        testConductor.enter("down-fourth-node")

        // --- HERE THE LEADER SHOULD DETECT FAILURE AND AUTO-DOWN THE UNREACHABLE NODE ---
@@ -91,8 +91,8 @@ class LeaderDowningNodeThatIsUnreachableSpec
        testConductor.enter("all-up")

        // kill 'second' node
-        testConductor.shutdown(second, 0)
        testConductor.removeNode(second)
+        testConductor.shutdown(second, 0)
        testConductor.enter("down-second-node")

        // --- HERE THE LEADER SHOULD DETECT FAILURE AND AUTO-DOWN THE UNREACHABLE NODE ---
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/LeaderElectionSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/LeaderElectionSpec.scala
@@ -65,8 +65,8 @@ abstract class LeaderElectionSpec extends MultiNodeSpec(LeaderElectionMultiJvmSp

        case `controller` ⇒
          testConductor.enter("before-shutdown")
-          testConductor.shutdown(leader, 0)
          testConductor.removeNode(leader)
+          testConductor.shutdown(leader, 0)
          testConductor.enter("after-shutdown", "after-down", "completed")

        case `leader` ⇒
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeShutdownSpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/NodeShutdownSpec.scala
@@ -57,8 +57,8 @@ abstract class NodeShutdownSpec extends MultiNodeSpec(NodeShutdownMultiJvmSpec)
    "become singleton cluster when one node is shutdown" taggedAs LongRunningTest in {
      runOn(first) {
        val secondAddress = node(second).address
-        testConductor.shutdown(second, 0)
        testConductor.removeNode(second)
+        testConductor.shutdown(second, 0)
        awaitUpConvergence(numberOfMembers = 1, canNotBePartOfMemberRing = Seq(secondAddress), 30.seconds)
        cluster.isSingletonCluster must be(true)
        assertLeader(first)
--- a/akka-remote-tests/src/main/scala/akka/remote/testconductor/Conductor.scala
+++ b/akka-remote-tests/src/main/scala/akka/remote/testconductor/Conductor.scala
@@ -444,7 +444,6 @@ private[akka] class Controller(private var initialParticipants: Int, controllerP
            nodes(node).fsm forward ToClient(TerminateMsg(exitValueOrKill))
          }
        case Remove(node) ⇒
-          nodes -= node
          barrier ! BarrierCoordinator.RemoveClient(node)
      }
    case GetNodes    ⇒ sender ! nodes.keys
@@ -540,8 +539,8 @@ private[akka] class BarrierCoordinator extends Actor with LoggingFSM[BarrierCoor

  when(Waiting) {
    case Event(EnterBarrier(name), d @ Data(clients, barrier, arrived)) ⇒
-      if (name != barrier || clients.find(_.fsm == sender).isEmpty) throw WrongBarrier(name, sender, d)
-      val together = sender :: arrived
+      if (name != barrier) throw WrongBarrier(name, sender, d)
+      val together = if (clients.find(_.fsm == sender).isDefined) sender :: arrived else arrived
      handleBarrier(d.copy(arrived = together))
    case Event(RemoveClient(name), d @ Data(clients, barrier, arrived)) ⇒
      clients find (_.name == name) match {