* CoordinatedShutdown that can run tasks for the configured phases in order (DAG); see the task-registration sketch after this list
* coordinate hand-over/shutdown of singleton with cluster exiting/shutdown
* phase config object with depends-on list
* integrate graceful leaving of sharding in coordinated shutdown
* add timeout and recover
* add some missing artery ports to tests
* leave via CoordinatedShutdown.run
* optionally exit-jvm in last phase
* run via JVM shutdown hook
* send ExitingConfirmed to leader before shutdown of the Exiting node, to not have to wait for the failure detector to mark it as unreachable before removing
* the unreachable signal is still kept as a safeguard in case the message is lost or the leader dies
* PhaseClusterExiting vs MemberExited in ClusterSingletonManager
* terminate ActorSystem when the cluster is shut down (via Down)
* add more predefined and custom phases
* reference documentation
* migration guide
* fix problem when the leader order was sys2, sys1, sys3: sys3 could not perform its duties and move Leaving sys1 to Exiting because it was observing sys1 as unreachable
* exclude Leaving with exitingConfirmed from convergence condition
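A minimal sketch of registering a shutdown task in one of the configured phases, using the same `CoordinatedShutdown.addTask` API that the singleton manager below uses for `PhaseClusterExiting` (the system name and task name are made-up examples):

    import scala.concurrent.Future
    import akka.Done
    import akka.actor.{ ActorSystem, CoordinatedShutdown }

    val system = ActorSystem("example")
    val coordShutdown = CoordinatedShutdown(system)
    // the returned Future must complete, or the phase's configured timeout expire,
    // before tasks in phases that depend on cluster-exiting are started
    coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "example-task") { () ⇒
      Future.successful(Done)
    }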
/**
 * Copyright (C) 2009-2017 Lightbend Inc. <http://www.lightbend.com>
 */

package akka.cluster.singleton

import com.typesafe.config.Config
import scala.concurrent.duration._
import scala.collection.immutable
import akka.actor.Actor
import akka.actor.Deploy
import akka.actor.ActorSystem
import akka.actor.ActorRef
import akka.actor.ActorSelection
import akka.actor.Address
import akka.actor.DeadLetterSuppression
import akka.actor.FSM
import akka.actor.Props
import akka.actor.Terminated
import akka.cluster.Cluster
import akka.cluster.ClusterEvent._
import akka.cluster.Member
import akka.cluster.MemberStatus
import akka.AkkaException
import akka.actor.NoSerializationVerificationNeeded
import akka.cluster.UniqueAddress
import akka.cluster.ClusterEvent
import scala.concurrent.Promise
import akka.Done
import akka.actor.CoordinatedShutdown
import akka.pattern.ask
import akka.util.Timeout
object ClusterSingletonManagerSettings {

  /**
   * Create settings from the default configuration
   * `akka.cluster.singleton`.
   */
  def apply(system: ActorSystem): ClusterSingletonManagerSettings =
    apply(system.settings.config.getConfig("akka.cluster.singleton"))
      .withRemovalMargin(Cluster(system).settings.DownRemovalMargin)

  /**
   * Create settings from a configuration with the same layout as
   * the default configuration `akka.cluster.singleton`.
   */
  def apply(config: Config): ClusterSingletonManagerSettings =
    new ClusterSingletonManagerSettings(
      singletonName = config.getString("singleton-name"),
      role = roleOption(config.getString("role")),
      removalMargin = Duration.Zero, // defaults to ClusterSettings.DownRemovalMargin
      handOverRetryInterval = config.getDuration("hand-over-retry-interval", MILLISECONDS).millis)

  /**
   * Java API: Create settings from the default configuration
   * `akka.cluster.singleton`.
   */
  def create(system: ActorSystem): ClusterSingletonManagerSettings = apply(system)

  /**
   * Java API: Create settings from a configuration with the same layout as
   * the default configuration `akka.cluster.singleton`.
   */
  def create(config: Config): ClusterSingletonManagerSettings = apply(config)

  /**
   * INTERNAL API
   */
  private[akka] def roleOption(role: String): Option[String] =
    if (role == "") None else Option(role)

}
/**
 * @param singletonName The actor name of the child singleton actor.
 *
 * @param role Singleton among the nodes tagged with specified role.
 *   If the role is not specified it's a singleton among all nodes in
 *   the cluster.
 *
 * @param removalMargin Margin until the singleton instance that belonged to
 *   a downed/removed partition is created in surviving partition. The purpose of
 *   this margin is that in case of a network partition the singleton actors
 *   in the non-surviving partitions must be stopped before corresponding actors
 *   are started somewhere else. This is especially important for persistent
 *   actors.
 *
 * @param handOverRetryInterval When a node is becoming oldest it sends hand-over
 *   request to previous oldest, that might be leaving the cluster. This is
 *   retried with this interval until the previous oldest confirms that the hand
 *   over has started or the previous oldest member is removed from the cluster
 *   (+ `removalMargin`).
 */
final class ClusterSingletonManagerSettings(
  val singletonName: String,
  val role: Option[String],
  val removalMargin: FiniteDuration,
  val handOverRetryInterval: FiniteDuration) extends NoSerializationVerificationNeeded {

  def withSingletonName(name: String): ClusterSingletonManagerSettings = copy(singletonName = name)

  def withRole(role: String): ClusterSingletonManagerSettings = copy(role = ClusterSingletonManagerSettings.roleOption(role))

  def withRole(role: Option[String]): ClusterSingletonManagerSettings = copy(role = role)

  def withRemovalMargin(removalMargin: FiniteDuration): ClusterSingletonManagerSettings =
    copy(removalMargin = removalMargin)

  def withHandOverRetryInterval(retryInterval: FiniteDuration): ClusterSingletonManagerSettings =
    copy(handOverRetryInterval = retryInterval)

  private def copy(
    singletonName: String = singletonName,
    role: Option[String] = role,
    removalMargin: FiniteDuration = removalMargin,
    handOverRetryInterval: FiniteDuration = handOverRetryInterval): ClusterSingletonManagerSettings =
    new ClusterSingletonManagerSettings(singletonName, role, removalMargin, handOverRetryInterval)
}
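// Illustrative sketch (not part of this file): settings are typically read from the
// ActorSystem configuration and then adjusted with the fluent `withX` methods. The
// "backend" role and the 30 second margin below are made-up example values.
//
//   val mgrSettings = ClusterSingletonManagerSettings(system)
//     .withRole("backend")
//     .withRemovalMargin(30.seconds)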
/**
 * Marker trait for remote messages with special serializer.
 */
sealed trait ClusterSingletonMessage extends Serializable

object ClusterSingletonManager {

  /**
   * Scala API: Factory method for `ClusterSingletonManager` [[akka.actor.Props]].
   */
  def props(
    singletonProps: Props,
    terminationMessage: Any,
    settings: ClusterSingletonManagerSettings): Props =
    Props(new ClusterSingletonManager(singletonProps, terminationMessage, settings)).withDeploy(Deploy.local)

  /**
   * INTERNAL API
   * public due to the `with FSM` type parameters
   */
  sealed trait State
  /**
   * INTERNAL API
   * public due to the `with FSM` type parameters
   */
  sealed trait Data

  /**
   * INTERNAL API
   */
  private[akka] object Internal {
    /**
     * Sent from new oldest to previous oldest to initiate the
     * hand-over process. `HandOverInProgress` and `HandOverDone`
     * are expected replies.
     */
    case object HandOverToMe extends ClusterSingletonMessage with DeadLetterSuppression
    /**
     * Confirmation by the previous oldest that the hand
     * over process, shut down of the singleton actor, has
     * started.
     */
    case object HandOverInProgress extends ClusterSingletonMessage
    /**
     * Confirmation by the previous oldest that the singleton
     * actor has been terminated and the hand-over process is
     * completed.
     */
    case object HandOverDone extends ClusterSingletonMessage
    /**
     * Sent from previous oldest to new oldest to
     * initiate the normal hand-over process.
     * Especially useful when new node joins and becomes
     * oldest immediately, without knowing who was previous
     * oldest.
     */
    case object TakeOverFromMe extends ClusterSingletonMessage with DeadLetterSuppression

    final case class HandOverRetry(count: Int)
    final case class TakeOverRetry(count: Int)
    case object Cleanup
    case object StartOldestChangedBuffer

    case object Start extends State
    case object Oldest extends State
    case object Younger extends State
    case object BecomingOldest extends State
    case object WasOldest extends State
    case object HandingOver extends State
    case object TakeOver extends State
    case object Stopping extends State
    case object End extends State

    case object Uninitialized extends Data
    final case class YoungerData(oldestOption: Option[UniqueAddress]) extends Data
    final case class BecomingOldestData(previousOldestOption: Option[UniqueAddress]) extends Data
    final case class OldestData(singleton: ActorRef, singletonTerminated: Boolean = false) extends Data
    final case class WasOldestData(singleton: ActorRef, singletonTerminated: Boolean,
                                   newOldestOption: Option[UniqueAddress]) extends Data
    final case class HandingOverData(singleton: ActorRef, handOverTo: Option[ActorRef]) extends Data
    final case class StoppingData(singleton: ActorRef) extends Data
    case object EndData extends Data
    final case class DelayedMemberRemoved(member: Member)
    case object SelfExiting

    val HandOverRetryTimer = "hand-over-retry"
    val TakeOverRetryTimer = "take-over-retry"
    val CleanupTimer = "cleanup"

    object OldestChangedBuffer {
      /**
       * Request to deliver one more event.
       */
      case object GetNext
      /**
       * The first event, corresponding to CurrentClusterState.
       */
      final case class InitialOldestState(oldest: Option[UniqueAddress], safeToBeOldest: Boolean)

      final case class OldestChanged(oldest: Option[UniqueAddress])
    }
    /**
     * Notifications of member events that track oldest member are tunneled
     * via this actor (child of ClusterSingletonManager) to be able to deliver
     * one change at a time. Avoiding simultaneous changes simplifies
     * the process in ClusterSingletonManager. ClusterSingletonManager requests
     * next event with `GetNext` when it is ready for it. Only one outstanding
     * `GetNext` request is allowed. Incoming events are buffered and delivered
     * upon `GetNext` request.
     */
    class OldestChangedBuffer(role: Option[String]) extends Actor {
      import OldestChangedBuffer._

      val cluster = Cluster(context.system)
      // sort by age, oldest first
      val ageOrdering = Member.ageOrdering
      var membersByAge: immutable.SortedSet[Member] = immutable.SortedSet.empty(ageOrdering)

      var changes = Vector.empty[AnyRef]

      // subscribe to MemberEvent, re-subscribe when restart
      override def preStart(): Unit = {
        cluster.subscribe(self, classOf[MemberEvent])

        // It's a delicate difference between CoordinatedShutdown.PhaseClusterExiting and MemberExited.
        // MemberExited event is published immediately (leader may have performed that transition on other node),
        // and that will trigger run of CoordinatedShutdown, while PhaseClusterExiting will happen later.
        // Using PhaseClusterExiting in the singleton because the graceful shutdown of sharding region
        // should preferably complete before stopping the singleton sharding coordinator on same node.
        val coordShutdown = CoordinatedShutdown(context.system)
        coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-1") { () ⇒
          implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
          self.ask(SelfExiting).mapTo[Done]
        }
      }
      override def postStop(): Unit = cluster.unsubscribe(self)

      def matchingRole(member: Member): Boolean = role match {
        case None    ⇒ true
        case Some(r) ⇒ member.hasRole(r)
      }

      def trackChange(block: () ⇒ Unit): Unit = {
        val before = membersByAge.headOption
        block()
        val after = membersByAge.headOption
        if (before != after)
          changes :+= OldestChanged(after.map(_.uniqueAddress))
      }

      def handleInitial(state: CurrentClusterState): Unit = {
        membersByAge = immutable.SortedSet.empty(ageOrdering) union state.members.filter(m ⇒
          (m.status == MemberStatus.Up || m.status == MemberStatus.Leaving) && matchingRole(m))
        val safeToBeOldest = !state.members.exists { m ⇒ (m.status == MemberStatus.Down || m.status == MemberStatus.Exiting) }
        val initial = InitialOldestState(membersByAge.headOption.map(_.uniqueAddress), safeToBeOldest)
        changes :+= initial
      }

      def add(m: Member): Unit = {
        if (matchingRole(m))
          trackChange { () ⇒
            // replace, it's possible that the upNumber is changed
            membersByAge = membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress)
            membersByAge += m
          }
      }

      def remove(m: Member): Unit = {
        if (matchingRole(m))
          trackChange { () ⇒
            membersByAge = membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress)
          }
      }

      def sendFirstChange(): Unit = {
        val event = changes.head
        changes = changes.tail
        context.parent ! event
      }

      def receive = {
        case state: CurrentClusterState ⇒ handleInitial(state)
        case MemberUp(m)                ⇒ add(m)
        case MemberRemoved(m, _)        ⇒ remove(m)
        case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress ⇒
          remove(m)
        case SelfExiting ⇒
          remove(cluster.readView.self)
          sender() ! Done // reply to ask
        case GetNext if changes.isEmpty ⇒
          context.become(deliverNext, discardOld = false)
        case GetNext ⇒
          sendFirstChange()
      }

      // the buffer was empty when GetNext was received, deliver next event immediately
      def deliverNext: Actor.Receive = {
        case state: CurrentClusterState ⇒
          handleInitial(state)
          sendFirstChange()
          context.unbecome()
        case MemberUp(m) ⇒
          add(m)
          deliverChanges()
        case MemberRemoved(m, _) ⇒
          remove(m)
          deliverChanges()
        case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress ⇒
          remove(m)
          deliverChanges()
        case SelfExiting ⇒
          remove(cluster.readView.self)
          deliverChanges()
          sender() ! Done // reply to ask
      }

      def deliverChanges(): Unit = {
        if (changes.nonEmpty) {
          sendFirstChange()
          context.unbecome()
        }
      }

      override def unhandled(msg: Any): Unit = {
        msg match {
          case _: MemberEvent ⇒ // ok, silence
          case _              ⇒ super.unhandled(msg)
        }
      }

    }

  }
}
/**
 * Thrown when a consistent state can't be determined within the
 * defined retry limits. Eventually it will reach a stable state and
 * can continue, and that is simplified by starting over with a clean
 * state. Parent supervisor should typically restart the actor, i.e.
 * default decision.
 */
class ClusterSingletonManagerIsStuck(message: String) extends AkkaException(message, null)

/**
 * Manages singleton actor instance among all cluster nodes or a group
 * of nodes tagged with a specific role. At most one singleton instance
 * is running at any point in time.
 *
 * The ClusterSingletonManager is supposed to be started on all nodes,
 * or all nodes with specified role, in the cluster with `actorOf`.
 * The actual singleton is started on the oldest node by creating a child
 * actor from the supplied `singletonProps`.
 *
 * The singleton actor is always running on the oldest member with specified role.
 * The oldest member is determined by [[akka.cluster.Member#isOlderThan]].
 * This can change when removing members. A graceful hand-over can normally
 * be performed when current oldest node is leaving the cluster. Be aware that
 * there is a short time period when there is no active singleton during the
 * hand-over process.
 *
 * The cluster failure detector will notice when oldest node
 * becomes unreachable due to things like JVM crash, hard shut down,
 * or network failure. When the crashed node has been removed (via down) from the
 * cluster then a new oldest node will take over and a new singleton actor is
 * created. For these failure scenarios there will not be a graceful hand-over,
 * but more than one active singleton is prevented by all reasonable means. Some
 * corner cases are eventually resolved by configurable timeouts.
 *
 * You access the singleton actor with [[ClusterSingletonProxy]].
 * Alternatively the singleton actor may broadcast its existence when it is started.
 *
 * Use factory method [[ClusterSingletonManager#props]] to create the
 * [[akka.actor.Props]] for the actor.
 *
 * @param singletonProps [[akka.actor.Props]] of the singleton actor instance.
 *
 * @param terminationMessage When handing over to a new oldest node
 *   this `terminationMessage` is sent to the singleton actor to tell
 *   it to finish its work, close resources, and stop.
 *   The hand-over to the new oldest node is completed when the
 *   singleton actor is terminated.
 *   Note that [[akka.actor.PoisonPill]] is a perfectly fine
 *   `terminationMessage` if you only need to stop the actor.
 *
 * @param settings see [[ClusterSingletonManagerSettings]]
 */
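// Illustrative usage sketch (not part of this file; `Consumer`, `End` and the "worker"
// role are made-up names): the manager is started with `actorOf` on every node, and it
// creates the singleton child from `singletonProps` only on the oldest member.
//
//   system.actorOf(
//     ClusterSingletonManager.props(
//       singletonProps = Props(classOf[Consumer]),
//       terminationMessage = End,
//       settings = ClusterSingletonManagerSettings(system).withRole("worker")),
//     name = "consumer")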
class ClusterSingletonManager(
  singletonProps: Props,
  terminationMessage: Any,
  settings: ClusterSingletonManagerSettings)
  extends Actor with FSM[ClusterSingletonManager.State, ClusterSingletonManager.Data] {

  import ClusterSingletonManager.Internal._
  import ClusterSingletonManager.Internal.OldestChangedBuffer._
  import settings._
  import FSM.`→`

  val cluster = Cluster(context.system)
  val selfUniqueAddressOption = Some(cluster.selfUniqueAddress)
  import cluster.settings.LogInfo

  require(
    role.forall(cluster.selfRoles.contains),
    s"This cluster member [${cluster.selfAddress}] doesn't have the role [$role]")

  val removalMargin =
    if (settings.removalMargin <= Duration.Zero) cluster.downingProvider.downRemovalMargin
    else settings.removalMargin

  val (maxHandOverRetries, maxTakeOverRetries) = {
    val n = (removalMargin.toMillis / handOverRetryInterval.toMillis).toInt
    val minRetries = context.system.settings.config.getInt(
      "akka.cluster.singleton.min-number-of-hand-over-retries")
    require(minRetries >= 1, "min-number-of-hand-over-retries must be >= 1")
    val handOverRetries = math.max(minRetries, n + 3)
    val takeOverRetries = math.max(1, handOverRetries - 3)

    (handOverRetries, takeOverRetries)
  }

  // started when self member is Up
  var oldestChangedBuffer: ActorRef = _
  // Previous GetNext request delivered event and new GetNext is to be sent
  var oldestChangedReceived = true

  var selfExited = false

  // keep track of previously removed members
  var removed = Map.empty[UniqueAddress, Deadline]

  def addRemoved(node: UniqueAddress): Unit =
    removed += node → (Deadline.now + 15.minutes)

  def cleanupOverdueNotMemberAnyMore(): Unit = {
    removed = removed filter { case (_, deadline) ⇒ deadline.hasTimeLeft }
  }

  // for CoordinatedShutdown
  val coordShutdown = CoordinatedShutdown(context.system)
  val memberExitingProgress = Promise[Done]()
  coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "wait-singleton-exiting")(() ⇒
    memberExitingProgress.future)
  coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-2") { () ⇒
    implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
    self.ask(SelfExiting).mapTo[Done]
  }

  def logInfo(message: String): Unit =
    if (LogInfo) log.info(message)

  def logInfo(template: String, arg1: Any): Unit =
    if (LogInfo) log.info(template, arg1)

  def logInfo(template: String, arg1: Any, arg2: Any): Unit =
    if (LogInfo) log.info(template, arg1, arg2)

  override def preStart(): Unit = {
    super.preStart()
    require(!cluster.isTerminated, "Cluster node must not be terminated")

    // subscribe to cluster changes, re-subscribe when restart
    cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved])

    setTimer(CleanupTimer, Cleanup, 1.minute, repeat = true)

    // defer subscription to avoid some jitter when
    // starting/joining several nodes at the same time
    cluster.registerOnMemberUp(self ! StartOldestChangedBuffer)
  }

  override def postStop(): Unit = {
    cancelTimer(CleanupTimer)
    cluster.unsubscribe(self)
    memberExitingProgress.trySuccess(Done)
    super.postStop()
  }

  def peer(at: Address): ActorSelection = context.actorSelection(self.path.toStringWithAddress(at))

  def getNextOldestChanged(): Unit =
    if (oldestChangedReceived) {
      oldestChangedReceived = false
      oldestChangedBuffer ! GetNext
    }

  startWith(Start, Uninitialized)
  when(Start) {
    case Event(StartOldestChangedBuffer, _) ⇒
      oldestChangedBuffer = context.actorOf(Props(classOf[OldestChangedBuffer], role).
        withDispatcher(context.props.dispatcher))
      getNextOldestChanged()
      stay

    case Event(InitialOldestState(oldestOption, safeToBeOldest), _) ⇒
      oldestChangedReceived = true
      if (oldestOption == selfUniqueAddressOption && safeToBeOldest)
        // oldest immediately
        gotoOldest()
      else if (oldestOption == selfUniqueAddressOption)
        goto(BecomingOldest) using BecomingOldestData(None)
      else
        goto(Younger) using YoungerData(oldestOption)
  }

  when(Younger) {
    case Event(OldestChanged(oldestOption), YoungerData(previousOldestOption)) ⇒
      oldestChangedReceived = true
      if (oldestOption == selfUniqueAddressOption) {
        logInfo("Younger observed OldestChanged: [{} -> myself]", previousOldestOption.map(_.address))
        previousOldestOption match {
          case None                                  ⇒ gotoOldest()
          case Some(prev) if removed.contains(prev)  ⇒ gotoOldest()
          case Some(prev) ⇒
            peer(prev.address) ! HandOverToMe
            goto(BecomingOldest) using BecomingOldestData(previousOldestOption)
        }
      } else {
        logInfo("Younger observed OldestChanged: [{} -> {}]", previousOldestOption.map(_.address), oldestOption.map(_.address))
        getNextOldestChanged()
        stay using YoungerData(oldestOption)
      }

    case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()

    case Event(MemberRemoved(m, _), _) ⇒
      scheduleDelayedMemberRemoved(m)
      stay

    case Event(DelayedMemberRemoved(m), YoungerData(Some(previousOldest))) if m.uniqueAddress == previousOldest ⇒
      logInfo("Previous oldest removed [{}]", m.address)
      addRemoved(m.uniqueAddress)
      // transition when OldestChanged
      stay using YoungerData(None)

    case Event(HandOverToMe, _) ⇒
      // this node was probably quickly restarted with same hostname:port,
      // confirm that the old singleton instance has been stopped
      sender() ! HandOverDone
      stay
  }

  when(BecomingOldest) {

    case Event(HandOverInProgress, _) ⇒
      // confirmation that the hand-over process has started
      logInfo("Hand-over in progress at [{}]", sender().path.address)
      cancelTimer(HandOverRetryTimer)
      stay

    case Event(HandOverDone, BecomingOldestData(Some(previousOldest))) ⇒
      if (sender().path.address == previousOldest.address)
        gotoOldest()
      else {
        logInfo(
          "Ignoring HandOverDone in BecomingOldest from [{}]. Expected previous oldest [{}]",
          sender().path.address, previousOldest.address)
        stay
      }

    case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()

    case Event(MemberRemoved(m, _), _) ⇒
      scheduleDelayedMemberRemoved(m)
      stay

    case Event(DelayedMemberRemoved(m), BecomingOldestData(Some(previousOldest))) if m.uniqueAddress == previousOldest ⇒
      logInfo("Previous oldest [{}] removed", previousOldest.address)
      addRemoved(m.uniqueAddress)
      gotoOldest()

    case Event(TakeOverFromMe, BecomingOldestData(previousOldestOption)) ⇒
      val senderAddress = sender().path.address
      // it would have been better to include the UniqueAddress in the TakeOverFromMe message,
      // but can't change due to backwards compatibility
      cluster.state.members.collectFirst { case m if m.address == senderAddress ⇒ m.uniqueAddress } match {
        case None ⇒
          // from unknown node, ignore
          logInfo(
            "Ignoring TakeOver request from unknown node in BecomingOldest from [{}].", senderAddress)
          stay
        case Some(senderUniqueAddress) ⇒
          previousOldestOption match {
            case Some(previousOldest) ⇒
              if (previousOldest == senderUniqueAddress) sender() ! HandOverToMe
              else logInfo(
                "Ignoring TakeOver request in BecomingOldest from [{}]. Expected previous oldest [{}]",
                sender().path.address, previousOldest.address)
              stay
            case None ⇒
              sender() ! HandOverToMe
              stay using BecomingOldestData(Some(senderUniqueAddress))
          }
      }

    case Event(HandOverRetry(count), BecomingOldestData(previousOldestOption)) ⇒
      if (count <= maxHandOverRetries) {
        logInfo("Retry [{}], sending HandOverToMe to [{}]", count, previousOldestOption.map(_.address))
        previousOldestOption.foreach(node ⇒ peer(node.address) ! HandOverToMe)
        setTimer(HandOverRetryTimer, HandOverRetry(count + 1), handOverRetryInterval, repeat = false)
        stay()
      } else if (previousOldestOption forall removed.contains) {
        // can't send HandOverToMe, previousOldest unknown for new node (or restart)
        // previous oldest might be down or removed, so no TakeOverFromMe message is received
        logInfo("Timeout in BecomingOldest. Previous oldest unknown, removed and no TakeOver request.")
        gotoOldest()
      } else if (cluster.isTerminated)
        stop()
      else
        throw new ClusterSingletonManagerIsStuck(
          s"Becoming singleton oldest was stuck because previous oldest [${previousOldestOption}] is unresponsive")
  }
  def scheduleDelayedMemberRemoved(m: Member): Unit = {
    if (removalMargin > Duration.Zero) {
      log.debug("Schedule DelayedMemberRemoved for [{}]", m.address)
      context.system.scheduler.scheduleOnce(removalMargin, self, DelayedMemberRemoved(m))(context.dispatcher)
    } else
      self ! DelayedMemberRemoved(m)
  }

  def gotoOldest(): State = {
    val singleton = context watch context.actorOf(singletonProps, singletonName)
    logInfo("Singleton manager starting singleton actor [{}]", singleton.path)
    goto(Oldest) using OldestData(singleton)
  }

  when(Oldest) {
    case Event(OldestChanged(oldestOption), OldestData(singleton, singletonTerminated)) ⇒
      oldestChangedReceived = true
      logInfo("Oldest observed OldestChanged: [{} -> {}]", cluster.selfAddress, oldestOption.map(_.address))
      oldestOption match {
        case Some(a) if a == cluster.selfUniqueAddress ⇒
          // already oldest
          stay
        case Some(a) if !selfExited && removed.contains(a) ⇒
          // The member removal was not completed and the old removed node is considered
          // oldest again. Safest is to terminate the singleton instance and goto Younger.
          // This node will become oldest again when the other is removed again.
          gotoHandingOver(singleton, singletonTerminated, None)
        case Some(a) ⇒
          // send TakeOver request in case the new oldest doesn't know previous oldest
          peer(a.address) ! TakeOverFromMe
          setTimer(TakeOverRetryTimer, TakeOverRetry(1), handOverRetryInterval, repeat = false)
          goto(WasOldest) using WasOldestData(singleton, singletonTerminated, newOldestOption = Some(a))
        case None ⇒
          // new oldest will initiate the hand-over
          setTimer(TakeOverRetryTimer, TakeOverRetry(1), handOverRetryInterval, repeat = false)
          goto(WasOldest) using WasOldestData(singleton, singletonTerminated, newOldestOption = None)
      }

    case Event(HandOverToMe, OldestData(singleton, singletonTerminated)) ⇒
      gotoHandingOver(singleton, singletonTerminated, Some(sender()))

    case Event(Terminated(ref), d @ OldestData(singleton, _)) if ref == singleton ⇒
      stay using d.copy(singletonTerminated = true)

    case Event(SelfExiting, _) ⇒
      selfMemberExited()
      // complete memberExitingProgress when handOverDone
      sender() ! Done // reply to ask
      stay
  }

  when(WasOldest) {
    case Event(TakeOverRetry(count), WasOldestData(singleton, singletonTerminated, newOldestOption)) ⇒
      if ((cluster.isTerminated || selfExited) && (newOldestOption.isEmpty || count > maxTakeOverRetries)) {
        if (singletonTerminated) stop()
        else gotoStopping(singleton)
      } else if (count <= maxTakeOverRetries) {
        logInfo("Retry [{}], sending TakeOverFromMe to [{}]", count, newOldestOption.map(_.address))
        newOldestOption.foreach(node ⇒ peer(node.address) ! TakeOverFromMe)
        setTimer(TakeOverRetryTimer, TakeOverRetry(count + 1), handOverRetryInterval, repeat = false)
        stay
      } else
        throw new ClusterSingletonManagerIsStuck(s"Expected hand-over to [${newOldestOption}] never occurred")

    case Event(HandOverToMe, WasOldestData(singleton, singletonTerminated, _)) ⇒
      gotoHandingOver(singleton, singletonTerminated, Some(sender()))

    case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress && !selfExited ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()

    case Event(MemberRemoved(m, _), WasOldestData(singleton, singletonTerminated, Some(newOldest))) if !selfExited && m.uniqueAddress == newOldest ⇒
      addRemoved(m.uniqueAddress)
      gotoHandingOver(singleton, singletonTerminated, None)

    case Event(Terminated(ref), d @ WasOldestData(singleton, _, _)) if ref == singleton ⇒
      stay using d.copy(singletonTerminated = true)

    case Event(SelfExiting, _) ⇒
      selfMemberExited()
      // complete memberExitingProgress when handOverDone
      sender() ! Done // reply to ask
      stay

  }
  def gotoHandingOver(singleton: ActorRef, singletonTerminated: Boolean, handOverTo: Option[ActorRef]): State = {
    if (singletonTerminated) {
      handOverDone(handOverTo)
    } else {
      handOverTo foreach { _ ! HandOverInProgress }
      singleton ! terminationMessage
      goto(HandingOver) using HandingOverData(singleton, handOverTo)
    }
  }

  when(HandingOver) {
    case (Event(Terminated(ref), HandingOverData(singleton, handOverTo))) if ref == singleton ⇒
      handOverDone(handOverTo)

    case Event(HandOverToMe, d @ HandingOverData(singleton, handOverTo)) if handOverTo == Some(sender()) ⇒
      // retry
      sender() ! HandOverInProgress
      stay

    case Event(SelfExiting, _) ⇒
      selfMemberExited()
      // complete memberExitingProgress when handOverDone
      sender() ! Done // reply to ask
      stay
  }

  def handOverDone(handOverTo: Option[ActorRef]): State = {
    val newOldest = handOverTo.map(_.path.address)
    logInfo("Singleton terminated, hand-over done [{} -> {}]", cluster.selfAddress, newOldest)
    handOverTo foreach { _ ! HandOverDone }
    memberExitingProgress.trySuccess(Done)
    if (removed.contains(cluster.selfUniqueAddress)) {
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()
    } else if (handOverTo.isEmpty)
      goto(Younger) using YoungerData(None)
    else
      goto(End) using EndData
  }

  def gotoStopping(singleton: ActorRef): State = {
    singleton ! terminationMessage
    goto(Stopping) using StoppingData(singleton)
  }

  when(Stopping) {
    case (Event(Terminated(ref), StoppingData(singleton))) if ref == singleton ⇒
      stop()
  }

  when(End) {
    case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()
  }

  def selfMemberExited(): Unit = {
    selfExited = true
    logInfo("Exited [{}]", cluster.selfAddress)
  }

  whenUnhandled {
    case Event(SelfExiting, _) ⇒
      selfMemberExited()
      memberExitingProgress.trySuccess(Done)
      sender() ! Done // reply to ask
      stay
    case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress && !selfExited ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      stop()
    case Event(MemberRemoved(m, _), _) ⇒
      if (!selfExited) logInfo("Member removed [{}]", m.address)
      addRemoved(m.uniqueAddress)
      stay
    case Event(DelayedMemberRemoved(m), _) ⇒
      if (!selfExited) logInfo("Member removed [{}]", m.address)
      addRemoved(m.uniqueAddress)
      stay
    case Event(TakeOverFromMe, _) ⇒
      logInfo("Ignoring TakeOver request in [{}] from [{}].", stateName, sender().path.address)
      stay
    case Event(Cleanup, _) ⇒
      cleanupOverdueNotMemberAnyMore()
      stay
  }

  onTransition {
    case from → to ⇒ logInfo("ClusterSingletonManager state change [{} -> {}]", from, to)
  }

  onTransition {
    case _ → BecomingOldest ⇒ setTimer(HandOverRetryTimer, HandOverRetry(1), handOverRetryInterval, repeat = false)
  }

  onTransition {
    case BecomingOldest → _ ⇒ cancelTimer(HandOverRetryTimer)
    case WasOldest → _      ⇒ cancelTimer(TakeOverRetryTimer)
  }

  onTransition {
    case _ → (Younger | Oldest) ⇒ getNextOldestChanged()
  }

  onTransition {
    case _ → (Younger | End) if removed.contains(cluster.selfUniqueAddress) ⇒
      logInfo("Self removed, stopping ClusterSingletonManager")
      // note that FSM.stop() can't be used in onTransition
      context.stop(self)
  }

}