Make cluster fault handling more robust, see #3030

* ClusterCoreDaemon and ClusterDomainEventPublisher can't be restarted
  because the state would be obsolete.
* Add an extra supervisor level for ClusterCoreDaemon and
  ClusterDomainEventPublisher, which shuts down the member
  when any of these children fails.
* Publish the final removed state on postStop in
  ClusterDomainEventPublisher. This also simplifies the removal
  process.
Patrik Nordwall 2013-02-11 10:40:01 +01:00
parent b002bda23f
commit cab78e5174
6 changed files with 86 additions and 32 deletions
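The first two bullets of the commit message boil down to the supervision pattern sketched below: an intermediate supervisor whose children are never restarted (their state would be stale), and which reacts to any child failure by stopping the child, stopping itself, and shutting the node down from postStop. This is a simplified, hypothetical sketch, not the actual ClusterCoreSupervisor from the diff; StatefulWorker, StopAndShutdownSupervisor and shutdownNode are illustrative names.

import akka.actor.{ Actor, ActorLogging, OneForOneStrategy, PoisonPill, Props }
import akka.actor.SupervisorStrategy.Stop
import scala.util.control.NonFatal

// A child whose in-memory state would be stale after a restart,
// so it must only ever be stopped, never restarted.
class StatefulWorker extends Actor {
  def receive = { case _ ⇒ () /* stateful work lives here */ }
}

// Intermediate supervisor: any child failure stops the child, poisons the
// supervisor itself and triggers node-level cleanup from postStop.
class StopAndShutdownSupervisor(shutdownNode: () ⇒ Unit) extends Actor with ActorLogging {
  val worker = context.actorOf(Props[StatefulWorker], "worker")

  override val supervisorStrategy = OneForOneStrategy() {
    case NonFatal(e) ⇒
      log.error(e, "worker crashed, shutting down this node")
      self ! PoisonPill // stop the supervisor too, after its mailbox is drained
      Stop              // never Restart: the worker's state would be obsolete
  }

  override def postStop(): Unit = shutdownNode()

  def receive = { case msg ⇒ worker forward msg }
}

Using PoisonPill rather than context.stop(self) lets any messages already queued ahead of it be processed before the supervisor stops and postStop runs.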


@@ -9,8 +9,10 @@ import scala.collection.immutable
 import scala.concurrent.duration._
 import scala.concurrent.forkjoin.ThreadLocalRandom
 import scala.util.control.NonFatal
-import akka.actor.{ Actor, ActorLogging, ActorRef, Address, Cancellable, Props, ReceiveTimeout, RootActorPath, Scheduler }
+import akka.actor.{ Actor, ActorLogging, ActorRef, Address, Cancellable, Props, PoisonPill, ReceiveTimeout, RootActorPath, Scheduler }
+import akka.actor.OneForOneStrategy
 import akka.actor.Status.Failure
+import akka.actor.SupervisorStrategy.Stop
 import akka.event.EventStream
 import akka.pattern.ask
 import akka.util.Timeout
@@ -103,6 +105,8 @@ private[cluster] object InternalClusterAction {
   case object GetClusterCoreRef
 
+  case class PublisherCreated(publisher: ActorRef)
+
   /**
    * Command to [[akka.cluster.ClusterDaemon]] to create a
    * [[akka.cluster.OnMemberUpListener]].
@@ -122,8 +126,6 @@ private[cluster] object InternalClusterAction {
   case class PublishChanges(newGossip: Gossip) extends PublishMessage
   case class PublishEvent(event: ClusterDomainEvent) extends PublishMessage
   case object PublishStart extends PublishMessage
-  case object PublishDone extends PublishMessage
-  case object PublishDoneFinished extends PublishMessage
 }
 
 /**
@@ -151,28 +153,61 @@ private[cluster] object ClusterLeaderAction {
  * Supervisor managing the different Cluster daemons.
  */
 private[cluster] final class ClusterDaemon(settings: ClusterSettings) extends Actor with ActorLogging {
+  import InternalClusterAction._
+
   // Important - don't use Cluster(context.system) here because that would
   // cause deadlock. The Cluster extension is currently being created and is waiting
   // for response from GetClusterCoreRef in its constructor.
-  val publisher = context.actorOf(Props[ClusterDomainEventPublisher].
-    withDispatcher(context.props.dispatcher), name = "publisher")
-  val core = context.actorOf(Props(new ClusterCoreDaemon(publisher)).
+  val coreSupervisor = context.actorOf(Props[ClusterCoreSupervisor].
     withDispatcher(context.props.dispatcher), name = "core")
   context.actorOf(Props[ClusterHeartbeatReceiver].
     withDispatcher(context.props.dispatcher), name = "heartbeatReceiver")
-  if (settings.MetricsEnabled) context.actorOf(Props(new ClusterMetricsCollector(publisher)).
-    withDispatcher(context.props.dispatcher), name = "metrics")
 
   def receive = {
-    case InternalClusterAction.GetClusterCoreRef ⇒ sender ! core
-    case InternalClusterAction.AddOnMemberUpListener(code) ⇒
+    case msg @ GetClusterCoreRef ⇒ coreSupervisor forward msg
+    case AddOnMemberUpListener(code) ⇒
       context.actorOf(Props(new OnMemberUpListener(code)))
+    case PublisherCreated(publisher) ⇒
+      if (settings.MetricsEnabled) {
+        // metrics must be started after core/publisher to be able
+        // to inject the publisher ref to the ClusterMetricsCollector
+        context.actorOf(Props(new ClusterMetricsCollector(publisher)).
+          withDispatcher(context.props.dispatcher), name = "metrics")
+      }
   }
 }
 
+/**
+ * INTERNAL API.
+ *
+ * ClusterCoreDaemon and ClusterDomainEventPublisher can't be restarted because the state
+ * would be obsolete. Shut down the member if any of those actors crashed.
+ */
+private[cluster] final class ClusterCoreSupervisor extends Actor with ActorLogging {
+  import InternalClusterAction._
+
+  val publisher = context.actorOf(Props[ClusterDomainEventPublisher].
+    withDispatcher(context.props.dispatcher), name = "publisher")
+  val coreDaemon = context.watch(context.actorOf(Props(new ClusterCoreDaemon(publisher)).
+    withDispatcher(context.props.dispatcher), name = "daemon"))
+
+  context.parent ! PublisherCreated(publisher)
+
+  override val supervisorStrategy =
+    OneForOneStrategy() {
+      case NonFatal(e) ⇒
+        log.error(e, "Cluster node [{}] crashed, [{}] - shutting down...",
+          Cluster(context.system).selfAddress, e.getMessage)
+        self ! PoisonPill
+        Stop
+    }
+
+  override def postStop(): Unit = Cluster(context.system).shutdown()
+
+  def receive = {
+    case InternalClusterAction.GetClusterCoreRef ⇒ sender ! coreDaemon
+  }
+}
+
 /**
  * INTERNAL API.
  */
@@ -196,7 +231,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Actor with ActorLogging {
    * Looks up and returns the remote cluster command connection for the specific address.
    */
   private def clusterCore(address: Address): ActorRef =
-    context.actorFor(RootActorPath(address) / "system" / "cluster" / "core")
+    context.actorFor(RootActorPath(address) / "system" / "cluster" / "core" / "daemon")
 
   val heartbeatSender = context.actorOf(Props[ClusterHeartbeatSender].
     withDispatcher(UseDispatcher), name = "heartbeatSender")
@@ -381,14 +416,7 @@ private[cluster] final class ClusterCoreDaemon(publisher: ActorRef) extends Actor with ActorLogging {
    */
   def removing(address: Address): Unit = {
     log.info("Cluster Node [{}] - Node has been REMOVED by the leader - shutting down...", selfAddress)
-    // just cleaning up the gossip state
-    latestGossip = Gossip.empty
-    publish(latestGossip)
-    context.become(removed)
-    // make sure the final (removed) state is published
-    // before shutting down
-    implicit val timeout = Timeout(5 seconds)
-    publisher ? PublishDone onComplete { case _ ⇒ cluster.shutdown() }
+    cluster.shutdown()
   }
 
   /**


@@ -341,6 +341,15 @@ private[cluster] final class ClusterDomainEventPublisher extends Actor with ActorLogging {
   var latestConvergedGossip: Gossip = Gossip.empty
   var bufferedEvents: immutable.IndexedSeq[ClusterDomainEvent] = Vector.empty
 
+  override def preRestart(reason: Throwable, message: Option[Any]) {
+    // don't postStop when restarted, no children to stop
+  }
+
+  override def postStop(): Unit = {
+    // publish the final removed state before shutting down
+    publishChanges(Gossip.empty)
+  }
+
   def receive = {
     case PublishChanges(newGossip) ⇒ publishChanges(newGossip)
     case currentStats: CurrentInternalStats ⇒ publishInternalStats(currentStats)
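The new postStop hook above is what implements the third bullet of the commit message: whenever the publisher is stopped (which happens as part of node shutdown), it publishes one last set of changes computed against an empty Gossip, so subscribers still observe the final Removed state without the old PublishDone/PublishDoneFinished handshake. The empty preRestart override matters because the default preRestart implementation calls postStop, and a mere restart should not emit the terminal state. Below is a minimal sketch of the same "terminal event from postStop" idea, with hypothetical names (Stopped, TerminalStatePublisher), not the Akka publisher itself.

import akka.actor.Actor
import akka.event.EventStream

case object Stopped // hypothetical terminal event

class TerminalStatePublisher extends Actor {
  def eventStream: EventStream = context.system.eventStream

  // normal publishing path while the actor is alive
  def receive = { case msg: AnyRef ⇒ eventStream.publish(msg) }

  // the default preRestart would call postStop; override it so a restart
  // does not emit the terminal event
  override def preRestart(reason: Throwable, message: Option[Any]): Unit = ()

  // runs on every stop, so subscribers always see the terminal event
  // without an explicit ask/ack round trip
  override def postStop(): Unit = eventStream.publish(Stopped)
}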
@@ -349,7 +358,6 @@ private[cluster] final class ClusterDomainEventPublisher extends Actor with ActorLogging {
     case Unsubscribe(subscriber, to) ⇒ unsubscribe(subscriber, to)
     case PublishEvent(event) ⇒ publish(event)
     case PublishStart ⇒ publishStart()
-    case PublishDone ⇒ publishDone(sender)
   }
 
   def eventStream: EventStream = context.system.eventStream
@@ -435,11 +443,6 @@ private[cluster] final class ClusterDomainEventPublisher extends Actor with ActorLogging {
     publishCurrentClusterState(None)
   }
 
-  def publishDone(receiver: ActorRef): Unit = {
-    clearState()
-    receiver ! PublishDoneFinished
-  }
-
   def clearState(): Unit = {
     latestGossip = Gossip.empty
     latestConvergedGossip = Gossip.empty


@@ -59,9 +59,6 @@ abstract class LeaderLeavingSpec
       // verify that the LEADER is shut down
       awaitCond(cluster.isTerminated)
 
-      // verify that the LEADER is REMOVED
-      awaitCond(clusterView.status == Removed)
-
     } else {
 
       val leavingLatch = TestLatch()


@@ -51,7 +51,24 @@ abstract class MembershipChangeListenerExitingSpec
       }
 
       runOn(second) {
+        val exitingLatch = TestLatch()
+        val removedLatch = TestLatch()
+        val secondAddress = address(second)
+        cluster.subscribe(system.actorOf(Props(new Actor {
+          def receive = {
+            case state: CurrentClusterState ⇒
+              if (state.members.exists(m ⇒ m.address == secondAddress && m.status == Exiting))
+                exitingLatch.countDown()
+            case MemberExited(m) if m.address == secondAddress ⇒
+              exitingLatch.countDown()
+            case MemberRemoved(m) if m.address == secondAddress ⇒
+              removedLatch.countDown()
+            case _ ⇒ // ignore
+          }
+        })), classOf[MemberEvent])
         enterBarrier("registered-listener")
+        exitingLatch.await
+        removedLatch.await
       }
 
       runOn(third) {


@@ -50,9 +50,8 @@ abstract class NodeLeavingAndExitingAndBeingRemovedSpec
       }
 
       runOn(second) {
-        // verify that the second node is shut down and has status REMOVED
+        // verify that the second node is shut down
         awaitCond(cluster.isTerminated, reaperWaitingTime)
-        awaitCond(clusterView.status == MemberStatus.Removed, reaperWaitingTime)
       }
 
       enterBarrier("finished")


@@ -91,5 +91,15 @@ class ClusterSpec extends AkkaSpec(ClusterSpec.config) with ImplicitSender {
       expectMsgClass(classOf[ClusterEvent.CurrentClusterState])
     }
 
+    // this must be the last test step, since the cluster is shutdown
+    "publish MemberRemoved when shutdown" in {
+      cluster.subscribe(testActor, classOf[ClusterEvent.MemberRemoved])
+      // first, is in response to the subscription
+      expectMsgClass(classOf[ClusterEvent.CurrentClusterState])
+
+      cluster.shutdown()
+      expectMsgType[ClusterEvent.MemberRemoved].member.address must be(selfAddress)
+    }
+
   }
 }
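The new ClusterSpec step also documents what application code can now rely on: the final Removed state is published even while the node is shutting down. A minimal usage sketch under that assumption follows; CleanupOnRemoved is an illustrative name, the actor system must be configured with the cluster extension, and the single-argument MemberRemoved(member) signature matches the one used in the specs above.

import akka.actor.{ Actor, ActorLogging }
import akka.cluster.Cluster
import akka.cluster.ClusterEvent.{ CurrentClusterState, MemberRemoved }

// Reacts to this node's own removal from the cluster, e.g. to flush buffers
// or release resources before the ActorSystem goes away.
class CleanupOnRemoved extends Actor with ActorLogging {
  val cluster = Cluster(context.system)

  override def preStart(): Unit = cluster.subscribe(self, classOf[MemberRemoved])
  override def postStop(): Unit = cluster.unsubscribe(self)

  def receive = {
    case _: CurrentClusterState ⇒ () // initial snapshot delivered on subscribe
    case MemberRemoved(m) if m.address == cluster.selfAddress ⇒
      log.info("Node [{}] was removed from the cluster, cleaning up", m.address)
      context.stop(self)
  }
}

Started with system.actorOf(Props[CleanupOnRemoved]) on each node, it runs its cleanup when the member is removed, including the shutdown path exercised by the new test.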