Stop singleton when self MemberDowned, #26336

* It's safer to stop the singleton instance early in case of downing, instead of waiting for MemberRemoved and trying to hand over.
* Stop ShardRegion when self MemberDowned, #26336
* Upper bound when waiting for seen in shutdownSelfWhenDown, #26336
parent 8e2d378228
commit ddada9a8e1
4 changed files with 165 additions and 2 deletions
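For orientation before the hunks below: the pattern introduced by this commit is to subscribe to MemberDowned in addition to MemberRemoved and to stop the actor as soon as the self member is downed, instead of waiting for removal and attempting a hand-over. What follows is a minimal, hypothetical sketch of that pattern in a standalone actor; the StopOnSelfDowned name is illustrative and not part of the commit.

import akka.actor.{ Actor, ActorLogging, Props }
import akka.cluster.Cluster
import akka.cluster.ClusterEvent
import akka.cluster.ClusterEvent.{ MemberDowned, MemberRemoved }

// Illustrative only: stop this actor as soon as the cluster marks the local
// member as Down, rather than waiting for the later MemberRemoved event.
class StopOnSelfDowned extends Actor with ActorLogging {
  private val cluster = Cluster(context.system)

  override def preStart(): Unit =
    // MemberDowned is published before MemberRemoved, so the actor can react earlier.
    cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberDowned], classOf[MemberRemoved])

  override def postStop(): Unit =
    cluster.unsubscribe(self)

  def receive: Receive = {
    case MemberDowned(m) if m.uniqueAddress == cluster.selfUniqueAddress =>
      log.info("Self downed, stopping [{}]", self.path)
      context.stop(self)
    case MemberRemoved(m, _) if m.uniqueAddress == cluster.selfUniqueAddress =>
      log.info("Self removed, stopping [{}]", self.path)
      context.stop(self)
    case _ => // other member events are not relevant here
  }
}

object StopOnSelfDowned {
  def props: Props = Props(new StopOnSelfDowned)
}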
@@ -516,6 +516,12 @@ private[akka] class ShardRegion(
       else if (matchingRole(m))
         changeMembers(membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress))

+    case MemberDowned(m) ⇒
+      if (m.uniqueAddress == cluster.selfUniqueAddress) {
+        log.info("Self downed, stopping ShardRegion [{}]", self.path)
+        context.stop(self)
+      }
+
     case _: MemberEvent ⇒ // these are expected, no need to warn about them

     case _ ⇒ unhandled(evt)
@@ -511,7 +511,7 @@ class ClusterSingletonManager(
   require(!cluster.isTerminated, "Cluster node must not be terminated")

   // subscribe to cluster changes, re-subscribe when restart
-  cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved])
+  cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved], classOf[MemberDowned])

   setTimer(CleanupTimer, Cleanup, 1.minute, repeat = true)

@@ -573,6 +573,10 @@ class ClusterSingletonManager(
         stay using YoungerData(oldestOption)
       }

+    case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      logInfo("Self downed, stopping ClusterSingletonManager")
+      stop()
+
     case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
       logInfo("Self removed, stopping ClusterSingletonManager")
       stop()

@@ -612,6 +616,10 @@ class ClusterSingletonManager(
         stay
       }

+    case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      logInfo("Self downed, stopping ClusterSingletonManager")
+      stop()
+
     case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
       logInfo("Self removed, stopping ClusterSingletonManager")
       stop()

@@ -722,6 +730,15 @@ class ClusterSingletonManager(
       // complete memberExitingProgress when handOverDone
       sender() ! Done // reply to ask
       stay
+
+    case Event(MemberDowned(m), OldestData(singleton, singletonTerminated)) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      if (singletonTerminated) {
+        logInfo("Self downed, stopping ClusterSingletonManager")
+        stop()
+      } else {
+        logInfo("Self downed, stopping")
+        gotoStopping(singleton)
+      }
   }

   when(WasOldest) {

@@ -761,6 +778,15 @@ class ClusterSingletonManager(
       sender() ! Done // reply to ask
       stay

+    case Event(MemberDowned(m), OldestData(singleton, singletonTerminated)) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
+      if (singletonTerminated) {
+        logInfo("Self downed, stopping ClusterSingletonManager")
+        stop()
+      } else {
+        logInfo("Self downed, stopping")
+        gotoStopping(singleton)
+      }
+
   }

   def gotoHandingOver(singleton: ActorRef, singletonTerminated: Boolean, handOverTo: Option[ActorRef]): State = {

@@ -853,6 +879,10 @@ class ClusterSingletonManager(
     case Event(Cleanup, _) ⇒
       cleanupOverdueNotMemberAnyMore()
       stay
+    case Event(MemberDowned(m), _) ⇒
+      if (m.uniqueAddress == cluster.selfUniqueAddress)
+        logInfo("Self downed, waiting for removal")
+      stay
   }

   onTransition {
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2019 Lightbend Inc. <https://www.lightbend.com>
+ */
+
+package akka.cluster.singleton
+
+import scala.concurrent.duration._
+
+import akka.actor.Actor
+import akka.actor.ActorRef
+import akka.actor.PoisonPill
+import akka.actor.Props
+import akka.cluster.Cluster
+import akka.cluster.MemberStatus
+import akka.remote.testconductor.RoleName
+import akka.remote.testkit.MultiNodeConfig
+import akka.remote.testkit.MultiNodeSpec
+import akka.remote.testkit.STMultiNodeSpec
+import akka.remote.transport.ThrottlerTransportAdapter
+import akka.testkit._
+import com.typesafe.config.ConfigFactory
+
+object ClusterSingletonManagerDownedSpec extends MultiNodeConfig {
+  val first = role("first")
+  val second = role("second")
+  val third = role("third")
+
+  commonConfig(ConfigFactory.parseString("""
+    akka.loglevel = INFO
+    akka.actor.provider = "cluster"
+    akka.remote.log-remote-lifecycle-events = off
+    """))
+
+  testTransport(on = true)
+
+  case object EchoStarted
+  case object EchoStopped
+  /**
+   * The singleton actor
+   */
+  class Echo(testActor: ActorRef) extends Actor {
+    testActor ! EchoStarted
+
+    override def postStop(): Unit = {
+      testActor ! EchoStopped
+    }
+
+    def receive = {
+      case _ ⇒ sender() ! self
+    }
+  }
+}
+
+class ClusterSingletonManagerDownedMultiJvmNode1 extends ClusterSingletonManagerDownedSpec
+class ClusterSingletonManagerDownedMultiJvmNode2 extends ClusterSingletonManagerDownedSpec
+class ClusterSingletonManagerDownedMultiJvmNode3 extends ClusterSingletonManagerDownedSpec
+
+class ClusterSingletonManagerDownedSpec extends MultiNodeSpec(ClusterSingletonManagerDownedSpec) with STMultiNodeSpec with ImplicitSender {
+  import ClusterSingletonManagerDownedSpec._
+
+  override def initialParticipants = roles.size
+
+  private val cluster = Cluster(system)
+
+  def join(from: RoleName, to: RoleName): Unit = {
+    runOn(from) {
+      cluster.join(node(to).address)
+      createSingleton()
+    }
+  }
+
+  def createSingleton(): ActorRef = {
+    system.actorOf(
+      ClusterSingletonManager.props(
+        singletonProps = Props(classOf[Echo], testActor),
+        terminationMessage = PoisonPill,
+        settings = ClusterSingletonManagerSettings(system)),
+      name = "echo")
+  }
+
+  "A ClusterSingletonManager downing" must {
+
+    "startup 3 node" in {
+      join(first, first)
+      join(second, first)
+      join(third, first)
+      within(15.seconds) {
+        awaitAssert {
+          cluster.state.members.size should ===(3)
+          cluster.state.members.map(_.status) should ===(Set(MemberStatus.Up))
+        }
+      }
+      runOn(first) {
+        expectMsg(EchoStarted)
+      }
+      enterBarrier("started")
+    }
+
+    "stop instance when member is downed" in {
+      runOn(first) {
+        testConductor.blackhole(first, third, ThrottlerTransportAdapter.Direction.Both).await
+        testConductor.blackhole(second, third, ThrottlerTransportAdapter.Direction.Both).await
+
+        within(15.seconds) {
+          awaitAssert {
+            cluster.state.unreachable.size should ===(1)
+          }
+        }
+      }
+      enterBarrier("blackhole-1")
+      runOn(first) {
+        // another blackhole so that second can't mark gossip as seen and thereby deferring shutdown of first
+        testConductor.blackhole(first, second, ThrottlerTransportAdapter.Direction.Both).await
+        cluster.down(node(second).address)
+        cluster.down(cluster.selfAddress)
+        // singleton instance stopped, before failure detection of first-second
+        expectMsg(3.seconds, EchoStopped)
+      }
+
+      enterBarrier("stopped")
+    }
+  }
+}
@@ -286,6 +286,7 @@ private[cluster] final class ClusterCoreSupervisor(joinConfigCompatChecker: Join
 private[cluster] object ClusterCoreDaemon {
   val NumberOfGossipsBeforeShutdownWhenLeaderExits = 5
   val MaxGossipsBeforeShuttingDownMyself = 5
+  val MaxTicksBeforeShuttingDownMyself = 4

 }

@@ -333,6 +334,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
   var seedNodeProcessCounter = 0 // for unique names
   var joinSeedNodesDeadline: Option[Deadline] = None
   var leaderActionCounter = 0
+  var selfDownCounter = 0

   var exitingTasksInProgress = false
   val selfExiting = Promise[Done]()

@@ -1112,7 +1114,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
       // status Down. The down commands should spread before we shutdown.
       val unreachable = membershipState.dcReachability.allUnreachableOrTerminated
       val downed = membershipState.dcMembers.collect { case m if m.status == Down ⇒ m.uniqueAddress }
-      if (downed.forall(node ⇒ unreachable(node) || latestGossip.seenByNode(node))) {
+      if (selfDownCounter >= MaxTicksBeforeShuttingDownMyself || downed.forall(node ⇒ unreachable(node) || latestGossip.seenByNode(node))) {
        // the reason for not shutting down immediately is to give the gossip a chance to spread
        // the downing information to other downed nodes, so that they can shutdown themselves
        logInfo("Shutting down myself")

@@ -1120,6 +1122,8 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
        // if other downed know that this node has seen the version
        gossipRandomN(MaxGossipsBeforeShuttingDownMyself)
        shutdown()
+      } else {
+        selfDownCounter += 1
      }
    }
  }
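The ClusterDaemon hunks above add an upper bound to shutdownSelfWhenDown: the downed node still prefers to wait until every other downed node is unreachable or has seen the gossip, but after MaxTicksBeforeShuttingDownMyself ticks it shuts itself down regardless, so shutdown cannot be deferred indefinitely. A rough, self-contained sketch of that bounded-wait pattern follows; the names and the tick driver are illustrative, not from the commit.

// Illustrative only: prefer to wait for a condition (all downed nodes have
// seen the gossip), but never defer shutdown for more than a fixed number of ticks.
object BoundedSelfShutdown {
  val MaxTicks = 4 // plays the role of MaxTicksBeforeShuttingDownMyself in the diff

  final case class State(tickCount: Int, shutDown: Boolean)

  def onTick(state: State, allDownedHaveSeen: Boolean): State =
    if (state.shutDown) state
    else if (allDownedHaveSeen || state.tickCount >= MaxTicks)
      state.copy(shutDown = true) // condition met or bound reached: shut down now
    else
      state.copy(tickCount = state.tickCount + 1) // defer and check again on the next tick

  def main(args: Array[String]): Unit = {
    // The condition never becomes true here, so shutdown triggers once the
    // counter reaches MaxTicks.
    val states = Iterator.iterate(State(0, shutDown = false))(s => onTick(s, allDownedHaveSeen = false))
    println(states.drop(6).next()) // State(4,true)
  }
}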