Stop singleton when self MemberDowned, #26336

* It's safer to stop the singleton instance early in case of downing, instead of waiting for MemberRemoved and trying to hand over.
* Stop ShardRegion when self MemberDowned, #26336
* Upper bound when waiting for seen in shutdownSelfWhenDown, #26336
parent 8e2d378228
commit ddada9a8e1
4 changed files with 165 additions and 2 deletions
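For orientation before the hunks below: the pattern introduced by this commit is to subscribe to MemberDowned in addition to MemberRemoved and to stop the actor as soon as the self member is downed, instead of waiting for removal and attempting a hand-over. What follows is a minimal, hypothetical sketch of that pattern in a standalone actor; the StopOnSelfDowned name is illustrative and not part of the commit.

import akka.actor.{ Actor, ActorLogging, Props }
import akka.cluster.Cluster
import akka.cluster.ClusterEvent
import akka.cluster.ClusterEvent.{ MemberDowned, MemberRemoved }

// Illustrative only: stop this actor as soon as the cluster marks the local
// member as Down, rather than waiting for the later MemberRemoved event.
class StopOnSelfDowned extends Actor with ActorLogging {
  private val cluster = Cluster(context.system)

  override def preStart(): Unit =
    // MemberDowned is published before MemberRemoved, so the actor can react earlier.
    cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberDowned], classOf[MemberRemoved])

  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case MemberDowned(m) if m.uniqueAddress == cluster.selfUniqueAddress =>
      log.info("Self downed, stopping [{}]", self.path)
      context.stop(self)
    case MemberRemoved(m, _) if m.uniqueAddress == cluster.selfUniqueAddress =>
      log.info("Self removed, stopping [{}]", self.path)
      context.stop(self)
    case _ => // other member events are not relevant here
  }
}

object StopOnSelfDowned {
  def props: Props = Props(new StopOnSelfDowned)
}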
@@ -516,6 +516,12 @@ private[akka] class ShardRegion(
       else if (matchingRole(m))
         changeMembers(membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress))

+    case MemberDowned(m) ⇒
+      if (m.uniqueAddress == cluster.selfUniqueAddress) {
+        log.info("Self downed, stopping ShardRegion [{}]", self.path)
+        context.stop(self)
+      }
+
     case _: MemberEvent ⇒ // these are expected, no need to warn about them

     case _ ⇒ unhandled(evt)
@@ -511,7 +511,7 @@ class ClusterSingletonManager(
   require(!cluster.isTerminated, "Cluster node must not be terminated")

   // subscribe to cluster changes, re-subscribe when restart
-  cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved])
+  cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved], classOf[MemberDowned])

   setTimer(CleanupTimer, Cleanup, 1.minute, repeat = true)

@@ -573,6 +573,10 @@ class ClusterSingletonManager(
         stay using YoungerData(oldestOption)
       }

+    case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      logInfo("Self downed, stopping ClusterSingletonManager")
+      stop()
+
     case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
       logInfo("Self removed, stopping ClusterSingletonManager")
       stop()

@@ -612,6 +616,10 @@ class ClusterSingletonManager(
         stay
       }

+    case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      logInfo("Self downed, stopping ClusterSingletonManager")
+      stop()
+
     case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
       logInfo("Self removed, stopping ClusterSingletonManager")
       stop()

@@ -722,6 +730,15 @@ class ClusterSingletonManager(
       // complete memberExitingProgress when handOverDone
       sender() ! Done // reply to ask
       stay
+
+    case Event(MemberDowned(m), OldestData(singleton, singletonTerminated)) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      if (singletonTerminated) {
+        logInfo("Self downed, stopping ClusterSingletonManager")
+        stop()
+      } else {
+        logInfo("Self downed, stopping")
+        gotoStopping(singleton)
+      }
   }

   when(WasOldest) {

@@ -761,6 +778,15 @@ class ClusterSingletonManager(
       sender() ! Done // reply to ask
       stay

+    case Event(MemberDowned(m), OldestData(singleton, singletonTerminated)) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      if (singletonTerminated) {
+        logInfo("Self downed, stopping ClusterSingletonManager")
+        stop()
+      } else {
+        logInfo("Self downed, stopping")
+        gotoStopping(singleton)
+      }
+
   }

   def gotoHandingOver(singleton: ActorRef, singletonTerminated: Boolean, handOverTo: Option[ActorRef]): State = {

@@ -853,6 +879,10 @@ class ClusterSingletonManager(
     case Event(Cleanup, _) ⇒
       cleanupOverdueNotMemberAnyMore()
       stay
+    case Event(MemberDowned(m), _) ⇒
+      if (m.uniqueAddress == cluster.selfUniqueAddress)
+        logInfo("Self downed, waiting for removal")
+      stay
   }

   onTransition {
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2019 Lightbend Inc. <https://www.lightbend.com>
+ */
+
+package akka.cluster.singleton
+
+import scala.concurrent.duration._
+
+import akka.actor.Actor
+import akka.actor.ActorRef
+import akka.actor.PoisonPill
+import akka.actor.Props
+import akka.cluster.Cluster
+import akka.cluster.MemberStatus
+import akka.remote.testconductor.RoleName
+import akka.remote.testkit.MultiNodeConfig
+import akka.remote.testkit.MultiNodeSpec
+import akka.remote.testkit.STMultiNodeSpec
+import akka.remote.transport.ThrottlerTransportAdapter
+import akka.testkit._
+import com.typesafe.config.ConfigFactory
+
+object ClusterSingletonManagerDownedSpec extends MultiNodeConfig {
+  val first = role("first")
+  val second = role("second")
+  val third = role("third")
+
+  commonConfig(ConfigFactory.parseString("""
+    akka.loglevel = INFO
+    akka.actor.provider = "cluster"
+    akka.remote.log-remote-lifecycle-events = off
+    """))
+
+  testTransport(on = true)
+
+  case object EchoStarted
+  case object EchoStopped
+  /**
+   * The singleton actor
+   */
+  class Echo(testActor: ActorRef) extends Actor {
+    testActor ! EchoStarted
+
+    override def postStop(): Unit = {
+      testActor ! EchoStopped
+    }
+
+    def receive = {
+      case _ ⇒ sender() ! self
+    }
+  }
+}
+
+class ClusterSingletonManagerDownedMultiJvmNode1 extends ClusterSingletonManagerDownedSpec
+class ClusterSingletonManagerDownedMultiJvmNode2 extends ClusterSingletonManagerDownedSpec
+class ClusterSingletonManagerDownedMultiJvmNode3 extends ClusterSingletonManagerDownedSpec
+
+class ClusterSingletonManagerDownedSpec extends MultiNodeSpec(ClusterSingletonManagerDownedSpec) with STMultiNodeSpec with ImplicitSender {
+  import ClusterSingletonManagerDownedSpec._
+
+  override def initialParticipants = roles.size
+
+  private val cluster = Cluster(system)
+
+  def join(from: RoleName, to: RoleName): Unit = {
+    runOn(from) {
+      cluster.join(node(to).address)
+      createSingleton()
+    }
+  }
+
+  def createSingleton(): ActorRef = {
+    system.actorOf(
+      ClusterSingletonManager.props(
+        singletonProps = Props(classOf[Echo], testActor),
+        terminationMessage = PoisonPill,
+        settings = ClusterSingletonManagerSettings(system)),
+      name = "echo")
+  }
+
+  "A ClusterSingletonManager downing" must {
+
+    "startup 3 node" in {
+      join(first, first)
+      join(second, first)
+      join(third, first)
+      within(15.seconds) {
+        awaitAssert {
+          cluster.state.members.size should ===(3)
+          cluster.state.members.map(_.status) should ===(Set(MemberStatus.Up))
+        }
+      }
+      runOn(first) {
+        expectMsg(EchoStarted)
+      }
+      enterBarrier("started")
+    }
+
+    "stop instance when member is downed" in {
+      runOn(first) {
+        testConductor.blackhole(first, third, ThrottlerTransportAdapter.Direction.Both).await
+        testConductor.blackhole(second, third, ThrottlerTransportAdapter.Direction.Both).await
+
+        within(15.seconds) {
+          awaitAssert {
+            cluster.state.unreachable.size should ===(1)
+          }
+        }
+      }
+      enterBarrier("blackhole-1")
+      runOn(first) {
+        // another blackhole so that second can't mark gossip as seen and thereby deferring shutdown of first
+        testConductor.blackhole(first, second, ThrottlerTransportAdapter.Direction.Both).await
+        cluster.down(node(second).address)
+        cluster.down(cluster.selfAddress)
+        // singleton instance stopped, before failure detection of first-second
+        expectMsg(3.seconds, EchoStopped)
+      }
+
+      enterBarrier("stopped")
+    }
+  }
+}
@@ -286,6 +286,7 @@ private[cluster] final class ClusterCoreSupervisor(joinConfigCompatChecker: Join
 private[cluster] object ClusterCoreDaemon {
   val NumberOfGossipsBeforeShutdownWhenLeaderExits = 5
   val MaxGossipsBeforeShuttingDownMyself = 5
+  val MaxTicksBeforeShuttingDownMyself = 4

 }

@@ -333,6 +334,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
   var seedNodeProcessCounter = 0 // for unique names
   var joinSeedNodesDeadline: Option[Deadline] = None
   var leaderActionCounter = 0
+  var selfDownCounter = 0

   var exitingTasksInProgress = false
   val selfExiting = Promise[Done]()

@@ -1112,7 +1114,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
       // status Down. The down commands should spread before we shutdown.
       val unreachable = membershipState.dcReachability.allUnreachableOrTerminated
       val downed = membershipState.dcMembers.collect { case m if m.status == Down ⇒ m.uniqueAddress }
-      if (downed.forall(node ⇒ unreachable(node) || latestGossip.seenByNode(node))) {
+      if (selfDownCounter >= MaxTicksBeforeShuttingDownMyself || downed.forall(node ⇒ unreachable(node) || latestGossip.seenByNode(node))) {
        // the reason for not shutting down immediately is to give the gossip a chance to spread
        // the downing information to other downed nodes, so that they can shutdown themselves
        logInfo("Shutting down myself")

@@ -1120,6 +1122,8 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
        // if other downed know that this node has seen the version
        gossipRandomN(MaxGossipsBeforeShuttingDownMyself)
        shutdown()
+      } else {
+        selfDownCounter += 1
      }
    }
  }
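The ClusterDaemon hunks above add an upper bound to shutdownSelfWhenDown: the downed node still prefers to wait until every other downed node is unreachable or has seen the gossip, but after MaxTicksBeforeShuttingDownMyself ticks it shuts itself down regardless, so shutdown cannot be deferred indefinitely. A rough, self-contained sketch of that bounded-wait pattern follows; the names and the tick driver are illustrative, not from the commit.

// Illustrative only: prefer to wait for a condition (all downed nodes have
// seen the gossip), but never defer shutdown for more than a fixed number of ticks.
object BoundedSelfShutdown {
  val MaxTicks = 4 // plays the role of MaxTicksBeforeShuttingDownMyself in the diff

  final case class State(tickCount: Int, shutDown: Boolean)

  def onTick(state: State, allDownedHaveSeen: Boolean): State =
    if (state.shutDown) state
    else if (allDownedHaveSeen || state.tickCount >= MaxTicks)
      state.copy(shutDown = true) // condition met or bound reached: shut down now
    else
      state.copy(tickCount = state.tickCount + 1) // defer and check again on the next tick

  def main(args: Array[String]): Unit = {
    // The condition never becomes true here, so shutdown triggers once the
    // counter reaches MaxTicks.
    val states = Iterator.iterate(State(0, shutDown = false))(s => onTick(s, allDownedHaveSeen = false))
    println(states.drop(6).next()) // State(4,true)
  }
}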