add CoordinatedShutdown, #21537

* CoordinatedShutdown that can run tasks for configured phases, in the order defined by the phase DAG
* coordinate handover/shutdown of singleton with cluster exiting/shutdown
* phase config object with depends-on list (see the sketch after this list)
* integrate graceful leaving of sharding in coordinated shutdown
* add per-phase timeout and recover settings
* add some missing artery ports to tests
* leave via CoordinatedShutdown.run
* optionally exit-jvm in last phase
* run via jvm shutdown hook
* send ExitingConfirmed to the leader before shutdown of an Exiting member,
  so it does not have to wait for the failure detector to mark the member as
  unreachable before removing it
* the unreachable signal is still kept as a safeguard in case the
  message is lost or the leader dies
* use CoordinatedShutdown.PhaseClusterExiting instead of MemberExited in ClusterSingletonManager
* terminate ActorSystem when cluster shutdown (via Down)
* add more predefined and custom phases
* reference documentation
* migration guide
* fix problem when the leader order was sys2, sys1, sys3:
  then sys3 could not perform its duties and move the Leaving sys1 to
  Exiting, because it was observing sys1 as unreachable
* exclude Leaving members with exitingConfirmed from the convergence condition
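
For reference, a minimal sketch of how the pieces above are meant to fit together. The custom
phase "my-phase", the task name "flush-caches", and the object/system names are made-up
illustrations; the config keys and the addTask/run API follow the reference documentation
added in this change:

  import scala.concurrent.Future
  import com.typesafe.config.ConfigFactory
  import akka.Done
  import akka.actor.{ ActorSystem, CoordinatedShutdown }

  object CoordinatedShutdownSketch extends App {
    // custom phase hooked into the DAG via depends-on; timeout/recover are per-phase settings
    val config = ConfigFactory.parseString("""
      akka.coordinated-shutdown {
        # run the full sequence automatically from a JVM shutdown hook
        run-by-jvm-shutdown-hook = on
        # optionally System.exit after the last phase
        exit-jvm = off
        phases {
          my-phase {
            depends-on = [cluster-exiting]
            timeout = 10 s
            # a failed or timed-out task does not abort the remaining phases
            recover = on
          }
        }
      }
      """).withFallback(ConfigFactory.load())

    val system = ActorSystem("sketch", config)

    // tasks are registered per phase and run when that phase is reached
    CoordinatedShutdown(system).addTask("my-phase", "flush-caches") { () ⇒
      Future.successful(Done) // replace with real cleanup that completes the returned Future
    }

    // leave the cluster and shut down gracefully via CoordinatedShutdown.run
    CoordinatedShutdown(system).run()
  }
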
Patrik Nordwall 2016-12-01 18:49:38 +01:00
parent 4a9c753710
commit 84ade6fdc3
69 changed files with 1778 additions and 339 deletions

@@ -25,6 +25,11 @@ import akka.AkkaException
import akka.actor.NoSerializationVerificationNeeded
import akka.cluster.UniqueAddress
import akka.cluster.ClusterEvent
import scala.concurrent.Promise
import akka.Done
import akka.actor.CoordinatedShutdown
import akka.pattern.ask
import akka.util.Timeout
object ClusterSingletonManagerSettings {
@@ -196,6 +201,7 @@ object ClusterSingletonManager {
final case class StoppingData(singleton: ActorRef) extends Data
case object EndData extends Data
final case class DelayedMemberRemoved(member: Member)
case object SelfExiting
val HandOverRetryTimer = "hand-over-retry"
val TakeOverRetryTimer = "take-over-retry"
@@ -236,6 +242,17 @@ object ClusterSingletonManager {
// subscribe to MemberEvent, re-subscribe when restart
override def preStart(): Unit = {
cluster.subscribe(self, classOf[MemberEvent])
// There is a subtle difference between CoordinatedShutdown.PhaseClusterExiting and MemberExited.
// The MemberExited event is published immediately (the leader may have performed that transition on another node),
// and that will trigger the run of CoordinatedShutdown, while PhaseClusterExiting will happen later.
// PhaseClusterExiting is used in the singleton because the graceful shutdown of the sharding region
// should preferably complete before the singleton sharding coordinator on the same node is stopped.
val coordShutdown = CoordinatedShutdown(context.system)
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-1") { () ⇒
implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
self.ask(SelfExiting).mapTo[Done]
}
}
override def postStop(): Unit = cluster.unsubscribe(self)
@@ -285,8 +302,12 @@ object ClusterSingletonManager {
def receive = {
case state: CurrentClusterState ⇒ handleInitial(state)
case MemberUp(m) ⇒ add(m)
case mEvent: MemberEvent if (mEvent.isInstanceOf[MemberExited] || mEvent.isInstanceOf[MemberRemoved]) ⇒
remove(mEvent.member)
case MemberRemoved(m, _) ⇒ remove(m)
case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress ⇒
remove(m)
case SelfExiting ⇒
remove(cluster.readView.self)
sender() ! Done // reply to ask
case GetNext if changes.isEmpty ⇒
context.become(deliverNext, discardOld = false)
case GetNext ⇒
@@ -301,16 +322,31 @@
context.unbecome()
case MemberUp(m) ⇒
add(m)
if (changes.nonEmpty) {
sendFirstChange()
context.unbecome()
}
case mEvent: MemberEvent if (mEvent.isInstanceOf[MemberExited] || mEvent.isInstanceOf[MemberRemoved]) ⇒
remove(mEvent.member)
if (changes.nonEmpty) {
sendFirstChange()
context.unbecome()
}
deliverChanges()
case MemberRemoved(m, _) ⇒
remove(m)
deliverChanges()
case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress ⇒
remove(m)
deliverChanges()
case SelfExiting ⇒
remove(cluster.readView.self)
deliverChanges()
sender() ! Done // reply to ask
}
def deliverChanges(): Unit = {
if (changes.nonEmpty) {
sendFirstChange()
context.unbecome()
}
}
override def unhandled(msg: Any): Unit = {
msg match {
case _: MemberEvent ⇒ // ok, silence
case _ ⇒ super.unhandled(msg)
}
}
}
@@ -422,6 +458,16 @@ class ClusterSingletonManager(
removed = removed filter { case (_, deadline) ⇒ deadline.hasTimeLeft }
}
// for CoordinatedShutdown
val coordShutdown = CoordinatedShutdown(context.system)
val memberExitingProgress = Promise[Done]()
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "wait-singleton-exiting")(() ⇒
memberExitingProgress.future)
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-2") { () ⇒
implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
self.ask(SelfExiting).mapTo[Done]
}
def logInfo(message: String): Unit =
if (LogInfo) log.info(message)
@@ -436,7 +482,7 @@
require(!cluster.isTerminated, "Cluster node must not be terminated")
// subscribe to cluster changes, re-subscribe when restart
cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberExited], classOf[MemberRemoved])
cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved])
setTimer(CleanupTimer, Cleanup, 1.minute, repeat = true)
@@ -448,6 +494,7 @@
override def postStop(): Unit = {
cancelTimer(CleanupTimer)
cluster.unsubscribe(self)
memberExitingProgress.trySuccess(Done)
super.postStop()
}
@@ -634,11 +681,17 @@ class ClusterSingletonManager(
case Event(Terminated(ref), d @ OldestData(singleton, _)) if ref == singleton ⇒
stay using d.copy(singletonTerminated = true)
case Event(SelfExiting, _) ⇒
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
}
when(WasOldest) {
case Event(TakeOverRetry(count), WasOldestData(singleton, singletonTerminated, newOldestOption)) ⇒
if (cluster.isTerminated && (newOldestOption.isEmpty || count > maxTakeOverRetries)) {
if ((cluster.isTerminated || selfExited) && (newOldestOption.isEmpty || count > maxTakeOverRetries)) {
if (singletonTerminated) stop()
else gotoStopping(singleton)
} else if (count <= maxTakeOverRetries) {
@@ -663,6 +716,12 @@ class ClusterSingletonManager(
case Event(Terminated(ref), d @ WasOldestData(singleton, _, _)) if ref == singleton ⇒
stay using d.copy(singletonTerminated = true)
case Event(SelfExiting, _) ⇒
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
}
def gotoHandingOver(singleton: ActorRef, singletonTerminated: Boolean, handOverTo: Option[ActorRef]): State = {
@@ -684,12 +743,18 @@
sender() ! HandOverInProgress
stay
case Event(SelfExiting, _) ⇒
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
}
def handOverDone(handOverTo: Option[ActorRef]): State = {
val newOldest = handOverTo.map(_.path.address)
logInfo("Singleton terminated, hand-over done [{} -> {}]", cluster.selfAddress, newOldest)
handOverTo foreach { _ ! HandOverDone }
memberExitingProgress.trySuccess(Done)
if (removed.contains(cluster.selfUniqueAddress)) {
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
@@ -715,12 +780,16 @@
stop()
}
def selfMemberExited(): Unit = {
selfExited = true
logInfo("Exited [{}]", cluster.selfAddress)
}
whenUnhandled {
case Event(MemberExited(m), _) ⇒
if (m.uniqueAddress == cluster.selfUniqueAddress) {
selfExited = true
logInfo("Exited [{}]", m.address)
}
case Event(SelfExiting, _) ⇒
selfMemberExited()
memberExitingProgress.trySuccess(Done)
sender() ! Done // reply to ask
stay
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress && !selfExited ⇒
logInfo("Self removed, stopping ClusterSingletonManager")