pekko/akka-cluster/src/multi-jvm/scala/akka/cluster/DowningWhenOtherHasQuarantinedThisActorSystemSpec.scala
Johan Andrén 26c56e5825
SBR downing when a node is quarantined from the other side (#29737)
* SBR now downs a node when it notices that it has been quarantined from other nodes #29565

* The MiMa excludes

* Review feedback mostly addressed

* One more stale comment removed

* More stress

* Ignore if remote quarantining node is not part of cluster

* Preliminary (untested) keepalive server support

* Completed reproducer of scenario discussed in PR

* Fix weird wrong extends in multi-jvm tests

* Put the test transport dropping after control junction to also drop control messages on blackhole.

* Test cleanup/review feedback addressed

* Ping from both nodes of side 1

Co-authored-by: Renato Cavalcanti <renato@cavalcanti.be>

* Add some debug logging to test to nail down failure cause

* Log when InboundTestStage lets messages through because no association yet

Co-authored-by: Renato Cavalcanti <renato@cavalcanti.be>
2021-01-28 09:49:15 +00:00

136 lines
4.8 KiB
Scala

/*
* Copyright (C) 2009-2020 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster
import akka.actor.ActorRef
import akka.actor.Identify
import akka.actor.RootActorPath
import scala.concurrent.duration._
import akka.remote.artery.ArterySettings
import akka.remote.artery.ThisActorSystemQuarantinedEvent
import akka.remote.testkit.MultiNodeConfig
import akka.remote.testkit.MultiNodeSpec
import akka.remote.transport.ThrottlerTransportAdapter
import akka.testkit.LongRunningTest
import com.typesafe.config.ConfigFactory
/**
 * Multi-node configuration for the "down itself when quarantined" SBR test.
 * Three nodes; Artery is enabled explicitly because the tested behavior only
 * exists for Artery remoting (see the pending-check in the spec below).
 */
object DowningWhenOtherHasQuarantinedThisActorSystemSpec extends MultiNodeConfig {
  // Roles are referenced by the abstract spec and by the per-node config below.
  val first = role("first")
  val second = role("second")
  val third = role("third")

  commonConfig(
    debugConfig(on = false)
      .withFallback(MultiNodeClusterSpec.clusterConfig)
      .withFallback(
        ConfigFactory.parseString("""
akka.remote.artery.enabled = on
akka.cluster.downing-provider-class = "akka.cluster.sbr.SplitBrainResolverProvider"
# speed up decision
akka.cluster.split-brain-resolver.stable-after = 5s
""")))

  // Exaggerate the timing issue by making the second node decide slower.
  // This is to more consistently repeat the scenario where the other side completes downing
  // while the isolated part still has not made a decision, and then sees quarantined
  // connections from the other nodes.
  nodeConfig(second)(ConfigFactory.parseString("akka.cluster.split-brain-resolver.stable-after = 15s"))

  // Enable the throttling test transport so the spec can blackhole/heal links via testConductor.
  testTransport(on = true)
}
// One concrete class per JVM/role, required by the sbt-multi-jvm naming convention
// (suffix MultiJvmNodeN maps each class to one of the three roles declared above).
class DowningWhenOtherHasQuarantinedThisActorSystemMultiJvmNode1
    extends DowningWhenOtherHasQuarantinedThisActorSystemSpec
class DowningWhenOtherHasQuarantinedThisActorSystemMultiJvmNode2
    extends DowningWhenOtherHasQuarantinedThisActorSystemSpec
class DowningWhenOtherHasQuarantinedThisActorSystemMultiJvmNode3
    extends DowningWhenOtherHasQuarantinedThisActorSystemSpec
/**
 * Verifies that a node downs (terminates) itself when it discovers it has been
 * quarantined by the other side of a partition (ThisActorSystemQuarantinedEvent),
 * and that a plain graceful shutdown of another node does NOT trigger that event.
 */
abstract class DowningWhenOtherHasQuarantinedThisActorSystemSpec
    extends MultiNodeSpec(DowningWhenOtherHasQuarantinedThisActorSystemSpec)
    with MultiNodeClusterSpec {
  import DowningWhenOtherHasQuarantinedThisActorSystemSpec._

  "Cluster node downed by other" must {

    if (!ArterySettings(system.settings.config.getConfig("akka.remote.artery")).Enabled) {
      // this feature only works in Artery, because classic remoting will not accept connections from
      // a quarantined node, and that is too high risk of introducing regressions if changing that
      pending
    }

    "join cluster" taggedAs LongRunningTest in {
      awaitClusterUp(first, second, third)
      enterBarrier("after-1")
    }

    "down itself" taggedAs LongRunningTest in {
      runOn(first) {
        // Isolate `second` from both other nodes, dropping traffic in both directions.
        testConductor.blackhole(first, second, ThrottlerTransportAdapter.Direction.Both).await
        testConductor.blackhole(third, second, ThrottlerTransportAdapter.Direction.Both).await
      }
      enterBarrier("blackhole")

      within(15.seconds) {
        runOn(first) {
          awaitAssert {
            cluster.state.unreachable.map(_.address) should ===(Set(address(second)))
          }
          awaitAssert {
            // second downed and removed (majority side decides first, stable-after = 5s)
            cluster.state.members.map(_.address) should ===(Set(address(first), address(third)))
          }
        }
        runOn(second) {
          // the isolated node sees the others as unreachable but has not decided yet,
          // because its stable-after was raised to 15s in the node config
          awaitAssert {
            cluster.state.unreachable.map(_.address) should ===(Set(address(first), address(third)))
          }
        }
      }
      enterBarrier("down-second")

      runOn(first) {
        // Heal the partition so `second` can talk to the others again and
        // learn that they have quarantined it.
        testConductor.passThrough(first, second, ThrottlerTransportAdapter.Direction.Both).await
        testConductor.passThrough(third, second, ThrottlerTransportAdapter.Direction.Both).await
      }
      enterBarrier("pass-through")

      runOn(second) {
        within(10.seconds) {
          awaitAssert {
            // try to ping first (Cluster Heartbeat messages will not trigger the Quarantine message)
            system.actorSelection(RootActorPath(first) / "user").tell(Identify(None), ActorRef.noSender)
            // shutting down itself triggered by ThisActorSystemQuarantinedEvent
            cluster.isTerminated should ===(true)
          }
        }
      }
      enterBarrier("after-2")
    }

    "not be triggered by another node shutting down" taggedAs LongRunningTest in {
      runOn(first) {
        system.eventStream.subscribe(testActor, classOf[ThisActorSystemQuarantinedEvent])
      }
      enterBarrier("subscribing")

      runOn(third) {
        // graceful leave/shutdown, not a quarantine
        cluster.shutdown()
      }

      runOn(first) {
        val sel = system.actorSelection(RootActorPath(third) / "user")
        (1 to 25).foreach { _ =>
          sel.tell(Identify(None), ActorRef.noSender) // try to ping third
          expectNoMessage(200.millis) // no ThisActorSystemQuarantinedEvent
        }
      }
      // Was "after-2", which duplicated the previous test's final barrier name;
      // renamed to follow the incrementing after-N convention (all nodes run the
      // same code so the rename stays consistent across JVMs).
      enterBarrier("after-3")
    }
  }
}