=clu #3871 Harden SurviveNetworkInstabilitySpec

* The problem was that the unreachability observed by second node was leaking from previous test step and when adding the blackhole, it could not heal and that caused the leader to not be able to remove the downed second node because some other nodes were still marked as unreachable. * The first node was not included in the the awaitAllReachable check in the previous step, and the order of awaitAllReachable and awaitMembersUp was wrong. * Included the awaitAllReachable check in assertCanTalk. * Changed to two-way blackhole and using barrier instead of scheduled event to trigger the exceptions when the blackhole was in place * We should investigate if unreachable observations from downed node can be excluded in the convergence check. Created separate ticket for that 3875.
2014-02-14 11:48:42 +01:00 · 2014-02-14 11:48:42 +01:00 · 2548ebd727
commit 2548ebd727
parent b1b18cb086
1 changed files with 23 additions and 12 deletions
--- a/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala
+++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/SurviveNetworkInstabilitySpec.scala
@ -52,12 +52,20 @@ object SurviveNetworkInstabilityMultiJvmSpec extends MultiNodeConfig {

    def receive = {
      case "hello" ⇒
-        context.system.scheduler.scheduleOnce(2.seconds, self, "boom")
+        context.actorSelection("/user/bad") ! self
        sender() ! "hello"
      case "boom" ⇒ throw new SimulatedException
    }
  }

+  class BadGuy extends Actor {
+    var victims = Vector.empty[ActorRef]
+    def receive = {
+      case ref: ActorRef ⇒ victims :+= ref
+      case "boom"        ⇒ victims foreach { _ ! "boom" }
+    }
+  }
+
  class Echo extends Actor {
    def receive = {
      case m ⇒ sender ! m
@ -94,8 +102,14 @@ abstract class SurviveNetworkInstabilitySpec
  }

  system.actorOf(Props[Echo], "echo")
+  val bad = system.actorOf(Props[BadGuy], "bad")

  def assertCanTalk(alive: RoleName*): Unit = {
+    runOn(alive: _*) {
+      awaitAllReachable
+    }
+    enterBarrier("reachable-ok")
+
    runOn(alive: _*) {
      for (to ← alive) {
        val sel = system.actorSelection(node(to) / "user" / "echo")
@ -141,7 +155,6 @@ abstract class SurviveNetworkInstabilitySpec
      // unreachable they must accept gossip from first and second when their
      // broken connection has healed, otherwise they will be isolated forever.

-      awaitAllReachable()
      enterBarrier("after-2")
      assertCanTalk(first, second, third, fourth, fifth)
    }
@ -168,8 +181,6 @@ abstract class SurviveNetworkInstabilitySpec
        }
      }
      enterBarrier("repair-3")
-      awaitAllReachable()
-      enterBarrier("after-3")
      assertCanTalk((others :+ first): _*)
    }

@ -199,8 +210,6 @@ abstract class SurviveNetworkInstabilitySpec
        }
      }
      enterBarrier("repair-4")
-      awaitAllReachable()
-      enterBarrier("after-4")
      assertCanTalk((island1 ++ island2): _*)
    }

@ -241,13 +250,12 @@ abstract class SurviveNetworkInstabilitySpec
      }

      enterBarrier("repair-5")
-      runOn((joining ++ others): _*) {
-        awaitAllReachable()
+      runOn((joining ++ others :+ first): _*) {
        // eighth not joined yet
-        awaitMembersUp(roles.size - 1)
+        awaitMembersUp(roles.size - 1, timeout = remaining)
      }
      enterBarrier("after-5")
-      assertCanTalk((joining ++ others): _*)
+      assertCanTalk((joining ++ others :+ first): _*)
    }

    "down and remove quarantined node" taggedAs LongRunningTest in within(60.seconds) {
@ -278,12 +286,15 @@ abstract class SurviveNetworkInstabilitySpec

      runOn(first) {
        for (role ← others)
-          testConductor.blackhole(second, role, Direction.Send).await
+          testConductor.blackhole(role, second, Direction.Both).await
      }
      enterBarrier("blackhole-6")

      runOn(third) {
-        // undelivered system messages in RemoteChild on third should trigger QuarantinedEvent
+        // this will trigger Exception in RemoteChild on third, and the failures
+        // can't be reported to parent on second, resulting in too many outstanding
+        // system messages and quarantine
+        bad ! "boom"
        within(10.seconds) {
          expectMsgType[QuarantinedEvent].address should be(address(second))
        }