=clu #3871 Harden SurviveNetworkInstabilitySpec
* The problem was that the unreachability observed by the second node was leaking from the previous test step, and when the blackhole was added it could not heal. That prevented the leader from removing the downed second node, because some other nodes were still marked as unreachable. * The first node was not included in the awaitAllReachable check in the previous step, and the order of awaitAllReachable and awaitMembersUp was wrong. * Included the awaitAllReachable check in assertCanTalk. * Changed to a two-way blackhole and to using a barrier instead of a scheduled event to trigger the exceptions once the blackhole is in place. * We should investigate whether unreachable observations from a downed node can be excluded in the convergence check. Created separate ticket #3875 for that.
This commit is contained in:
parent
b1b18cb086
commit
2548ebd727
1 changed files with 23 additions and 12 deletions
|
|
@ -52,12 +52,20 @@ object SurviveNetworkInstabilityMultiJvmSpec extends MultiNodeConfig {
|
||||||
|
|
||||||
def receive = {
|
def receive = {
|
||||||
case "hello" ⇒
|
case "hello" ⇒
|
||||||
context.system.scheduler.scheduleOnce(2.seconds, self, "boom")
|
context.actorSelection("/user/bad") ! self
|
||||||
sender() ! "hello"
|
sender() ! "hello"
|
||||||
case "boom" ⇒ throw new SimulatedException
|
case "boom" ⇒ throw new SimulatedException
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
class BadGuy extends Actor {
|
||||||
|
var victims = Vector.empty[ActorRef]
|
||||||
|
def receive = {
|
||||||
|
case ref: ActorRef ⇒ victims :+= ref
|
||||||
|
case "boom" ⇒ victims foreach { _ ! "boom" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
class Echo extends Actor {
|
class Echo extends Actor {
|
||||||
def receive = {
|
def receive = {
|
||||||
case m ⇒ sender ! m
|
case m ⇒ sender ! m
|
||||||
|
|
@ -94,8 +102,14 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
}
|
}
|
||||||
|
|
||||||
system.actorOf(Props[Echo], "echo")
|
system.actorOf(Props[Echo], "echo")
|
||||||
|
val bad = system.actorOf(Props[BadGuy], "bad")
|
||||||
|
|
||||||
def assertCanTalk(alive: RoleName*): Unit = {
|
def assertCanTalk(alive: RoleName*): Unit = {
|
||||||
|
runOn(alive: _*) {
|
||||||
|
awaitAllReachable
|
||||||
|
}
|
||||||
|
enterBarrier("reachable-ok")
|
||||||
|
|
||||||
runOn(alive: _*) {
|
runOn(alive: _*) {
|
||||||
for (to ← alive) {
|
for (to ← alive) {
|
||||||
val sel = system.actorSelection(node(to) / "user" / "echo")
|
val sel = system.actorSelection(node(to) / "user" / "echo")
|
||||||
|
|
@ -141,7 +155,6 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
// unreachable they must accept gossip from first and second when their
|
// unreachable they must accept gossip from first and second when their
|
||||||
// broken connection has healed, otherwise they will be isolated forever.
|
// broken connection has healed, otherwise they will be isolated forever.
|
||||||
|
|
||||||
awaitAllReachable()
|
|
||||||
enterBarrier("after-2")
|
enterBarrier("after-2")
|
||||||
assertCanTalk(first, second, third, fourth, fifth)
|
assertCanTalk(first, second, third, fourth, fifth)
|
||||||
}
|
}
|
||||||
|
|
@ -168,8 +181,6 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
enterBarrier("repair-3")
|
enterBarrier("repair-3")
|
||||||
awaitAllReachable()
|
|
||||||
enterBarrier("after-3")
|
|
||||||
assertCanTalk((others :+ first): _*)
|
assertCanTalk((others :+ first): _*)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -199,8 +210,6 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
enterBarrier("repair-4")
|
enterBarrier("repair-4")
|
||||||
awaitAllReachable()
|
|
||||||
enterBarrier("after-4")
|
|
||||||
assertCanTalk((island1 ++ island2): _*)
|
assertCanTalk((island1 ++ island2): _*)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -241,13 +250,12 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
}
|
}
|
||||||
|
|
||||||
enterBarrier("repair-5")
|
enterBarrier("repair-5")
|
||||||
runOn((joining ++ others): _*) {
|
runOn((joining ++ others :+ first): _*) {
|
||||||
awaitAllReachable()
|
|
||||||
// eighth not joined yet
|
// eighth not joined yet
|
||||||
awaitMembersUp(roles.size - 1)
|
awaitMembersUp(roles.size - 1, timeout = remaining)
|
||||||
}
|
}
|
||||||
enterBarrier("after-5")
|
enterBarrier("after-5")
|
||||||
assertCanTalk((joining ++ others): _*)
|
assertCanTalk((joining ++ others :+ first): _*)
|
||||||
}
|
}
|
||||||
|
|
||||||
"down and remove quarantined node" taggedAs LongRunningTest in within(60.seconds) {
|
"down and remove quarantined node" taggedAs LongRunningTest in within(60.seconds) {
|
||||||
|
|
@ -278,12 +286,15 @@ abstract class SurviveNetworkInstabilitySpec
|
||||||
|
|
||||||
runOn(first) {
|
runOn(first) {
|
||||||
for (role ← others)
|
for (role ← others)
|
||||||
testConductor.blackhole(second, role, Direction.Send).await
|
testConductor.blackhole(role, second, Direction.Both).await
|
||||||
}
|
}
|
||||||
enterBarrier("blackhole-6")
|
enterBarrier("blackhole-6")
|
||||||
|
|
||||||
runOn(third) {
|
runOn(third) {
|
||||||
// undelivered system messages in RemoteChild on third should trigger QuarantinedEvent
|
// this will trigger Exception in RemoteChild on third, and the failures
|
||||||
|
// can't be reported to parent on second, resulting in too many outstanding
|
||||||
|
// system messages and quarantine
|
||||||
|
bad ! "boom"
|
||||||
within(10.seconds) {
|
within(10.seconds) {
|
||||||
expectMsgType[QuarantinedEvent].address should be(address(second))
|
expectMsgType[QuarantinedEvent].address should be(address(second))
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue