pekko/akka-cluster-tools/src/main/scala/akka/cluster/singleton/ClusterSingletonManager.scala


/*
* Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster.singleton
import com.typesafe.config.Config
import scala.concurrent.duration._
import scala.collection.immutable
import scala.concurrent.Future
import akka.actor.Actor
import akka.actor.Deploy
import akka.actor.ActorSystem
import akka.actor.ActorRef
import akka.actor.ActorSelection
import akka.actor.Address
import akka.actor.DeadLetterSuppression
import akka.actor.FSM
import akka.actor.Props
import akka.actor.Terminated
import akka.cluster._
import akka.cluster.ClusterEvent._
import akka.AkkaException
import akka.actor.NoSerializationVerificationNeeded
import akka.pattern.pipe
import akka.util.JavaDurationConverters._
import scala.concurrent.Promise
import akka.Done
import akka.actor.CoordinatedShutdown
import akka.annotation.DoNotInherit
import akka.coordination.lease.LeaseUsageSettings
import akka.pattern.ask
import akka.util.Timeout
import akka.coordination.lease.scaladsl.{ Lease, LeaseProvider }
import akka.dispatch.Dispatchers
import com.github.ghik.silencer.silent
import scala.util.control.NonFatal
object ClusterSingletonManagerSettings {
/**
* Create settings from the default configuration
* `akka.cluster.singleton`.
*/
@silent // deprecated setting
def apply(system: ActorSystem): ClusterSingletonManagerSettings =
apply(system.settings.config.getConfig("akka.cluster.singleton"))
// note that this setting has some additional logic inside the ClusterSingletonManager
// falling back to DowningProvider.downRemovalMargin if it is off/Zero
.withRemovalMargin(Cluster(system).settings.DownRemovalMargin)
/**
* Create settings from a configuration with the same layout as
* the default configuration `akka.cluster.singleton`.
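*
* A configuration sketch with the keys read below (values are illustrative,
* not authoritative defaults):
* {{{
* akka.cluster.singleton {
*   singleton-name = "singleton"
*   role = ""
*   hand-over-retry-interval = 1s
*   use-lease = ""
*   lease-retry-interval = 5s
* }
* }}}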
*/
def apply(config: Config): ClusterSingletonManagerSettings = {
val lease = config.getString("use-lease") match {
case s if s.isEmpty => None
case leaseConfigPath =>
Some(new LeaseUsageSettings(leaseConfigPath, config.getDuration("lease-retry-interval").asScala))
}
new ClusterSingletonManagerSettings(
singletonName = config.getString("singleton-name"),
role = roleOption(config.getString("role")),
removalMargin = Duration.Zero, // defaults to ClusterSettings.DownRemovalMargin
handOverRetryInterval = config.getDuration("hand-over-retry-interval", MILLISECONDS).millis,
lease)
}
/**
* Java API: Create settings from the default configuration
* `akka.cluster.singleton`.
*/
def create(system: ActorSystem): ClusterSingletonManagerSettings = apply(system)
/**
* Java API: Create settings from a configuration with the same layout as
* the default configuration `akka.cluster.singleton`.
*/
def create(config: Config): ClusterSingletonManagerSettings = apply(config)
/**
* INTERNAL API
*/
private[akka] def roleOption(role: String): Option[String] =
if (role == "") None else Option(role)
}
/**
* @param singletonName The actor name of the child singleton actor.
*
* @param role Singleton among the nodes tagged with the specified role.
* If the role is not specified it's a singleton among all nodes in
* the cluster.
*
* @param removalMargin Margin until the singleton instance that belonged to
* a downed/removed partition is created in the surviving partition. The purpose of
* this margin is that in case of a network partition the singleton actors
* in the non-surviving partitions must be stopped before corresponding actors
* are started somewhere else. This is especially important for persistent
* actors.
*
* @param handOverRetryInterval When a node is becoming oldest it sends a hand-over
* request to the previous oldest, which might be leaving the cluster. This is
* retried with this interval until the previous oldest confirms that the hand-over
* has started or the previous oldest member is removed from the cluster
* (+ `removalMargin`).
*
* @param leaseSettings Settings for the lease that must be acquired before creating the singleton actor, if any
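*
* A minimal usage sketch (the role name and margin are illustrative):
* {{{
* // requires import scala.concurrent.duration._
* val settings = ClusterSingletonManagerSettings(system)
*   .withRole("worker")
*   .withRemovalMargin(10.seconds)
* }}}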
*/
final class ClusterSingletonManagerSettings(
val singletonName: String,
val role: Option[String],
val removalMargin: FiniteDuration,
val handOverRetryInterval: FiniteDuration,
val leaseSettings: Option[LeaseUsageSettings])
extends NoSerializationVerificationNeeded {
// bin compat for akka 2.5.21
def this(
singletonName: String,
role: Option[String],
removalMargin: FiniteDuration,
handOverRetryInterval: FiniteDuration) =
this(singletonName, role, removalMargin, handOverRetryInterval, None)
def withSingletonName(name: String): ClusterSingletonManagerSettings = copy(singletonName = name)
def withRole(role: String): ClusterSingletonManagerSettings =
copy(role = ClusterSingletonManagerSettings.roleOption(role))
def withRole(role: Option[String]): ClusterSingletonManagerSettings = copy(role = role)
def withRemovalMargin(removalMargin: FiniteDuration): ClusterSingletonManagerSettings =
copy(removalMargin = removalMargin)
def withHandOverRetryInterval(retryInterval: FiniteDuration): ClusterSingletonManagerSettings =
copy(handOverRetryInterval = retryInterval)
def withLeaseSettings(leaseSettings: LeaseUsageSettings): ClusterSingletonManagerSettings =
copy(leaseSettings = Some(leaseSettings))
private def copy(
singletonName: String = singletonName,
role: Option[String] = role,
removalMargin: FiniteDuration = removalMargin,
handOverRetryInterval: FiniteDuration = handOverRetryInterval,
leaseSettings: Option[LeaseUsageSettings] = leaseSettings): ClusterSingletonManagerSettings =
new ClusterSingletonManagerSettings(singletonName, role, removalMargin, handOverRetryInterval, leaseSettings)
}
/**
* Marker trait for remote messages with special serializer.
*/
sealed trait ClusterSingletonMessage extends Serializable
object ClusterSingletonManager {
/**
* Scala API: Factory method for `ClusterSingletonManager` [[akka.actor.Props]].
*/
def props(singletonProps: Props, terminationMessage: Any, settings: ClusterSingletonManagerSettings): Props =
Props(new ClusterSingletonManager(singletonProps, terminationMessage, settings))
.withDispatcher(Dispatchers.InternalDispatcherId)
.withDeploy(Deploy.local)
/**
* INTERNAL API
* public due to the `with FSM` type parameters
*/
sealed trait State
/**
* INTERNAL API
* public due to the `with FSM` type parameters
*/
sealed trait Data
/**
* INTERNAL API
*/
private[akka] object Internal {
/**
* Sent from the new oldest to the previous oldest to initiate the
* hand-over process. `HandOverInProgress` and `HandOverDone`
* are expected replies.
*/
case object HandOverToMe extends ClusterSingletonMessage with DeadLetterSuppression
/**
* Confirmation by the previous oldest that the hand-over
* process, i.e. shutdown of the singleton actor, has
* started.
*/
case object HandOverInProgress extends ClusterSingletonMessage
/**
* Confirmation by the previous oldest that the singleton
* actor has been terminated and the hand-over process is
* completed.
*/
case object HandOverDone extends ClusterSingletonMessage
/**
* Sent from the previous oldest to the new oldest to
* initiate the normal hand-over process.
* Especially useful when a new node joins and becomes
* oldest immediately, without knowing who the previous
* oldest was.
*/
case object TakeOverFromMe extends ClusterSingletonMessage with DeadLetterSuppression
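// Hand-over sequence, as a sketch built from the message docs above:
//   new oldest --- HandOverToMe --> previous oldest
//   previous oldest --- HandOverInProgress --> new oldest (singleton is being stopped)
//   previous oldest --- HandOverDone --> new oldest (singleton terminated)
// TakeOverFromMe lets the previous oldest proactively start this exchange.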
final case class HandOverRetry(count: Int)
final case class TakeOverRetry(count: Int)
case object LeaseRetry
case object Cleanup
case object StartOldestChangedBuffer
case object Start extends State
case object AcquiringLease extends State
case object Oldest extends State
case object Younger extends State
case object BecomingOldest extends State
case object WasOldest extends State
case object HandingOver extends State
case object TakeOver extends State
case object Stopping extends State
case object End extends State
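// Typical progression of the FSM states (a sketch of the common path, not exhaustive):
//   Start -> Younger -> BecomingOldest -> Oldest -> WasOldest -> HandingOver -> End
// When a lease is configured, AcquiringLease is entered before Oldest.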
case object Uninitialized extends Data
final case class YoungerData(oldestOption: Option[UniqueAddress]) extends Data
final case class BecomingOldestData(previousOldestOption: Option[UniqueAddress]) extends Data
final case class OldestData(singleton: Option[ActorRef]) extends Data
final case class WasOldestData(singleton: Option[ActorRef], newOldestOption: Option[UniqueAddress]) extends Data
final case class HandingOverData(singleton: ActorRef, handOverTo: Option[ActorRef]) extends Data
final case class StoppingData(singleton: ActorRef) extends Data
case object EndData extends Data
final case class DelayedMemberRemoved(member: Member)
case object SelfExiting
final case class AcquiringLeaseData(leaseRequestInProgress: Boolean, singleton: Option[ActorRef]) extends Data
val HandOverRetryTimer = "hand-over-retry"
val TakeOverRetryTimer = "take-over-retry"
val CleanupTimer = "cleanup"
val LeaseRetryTimer = "lease-retry"
object OldestChangedBuffer {
/**
* Request to deliver one more event.
*/
case object GetNext
/**
* The first event, corresponding to CurrentClusterState.
*/
final case class InitialOldestState(oldest: Option[UniqueAddress], safeToBeOldest: Boolean)
final case class OldestChanged(oldest: Option[UniqueAddress])
}
final case class AcquireLeaseResult(holdingLease: Boolean) extends DeadLetterSuppression
final case class ReleaseLeaseResult(released: Boolean) extends DeadLetterSuppression
final case class AcquireLeaseFailure(t: Throwable) extends DeadLetterSuppression
final case class ReleaseLeaseFailure(t: Throwable) extends DeadLetterSuppression
final case class LeaseLost(reason: Option[Throwable]) extends DeadLetterSuppression
/**
* Notifications of member events that track the oldest member are tunneled
* via this actor (child of ClusterSingletonManager) to be able to deliver
* one change at a time. Avoiding simultaneous changes simplifies
* the process in ClusterSingletonManager. ClusterSingletonManager requests the
* next event with `GetNext` when it is ready for it. Only one outstanding
* `GetNext` request is allowed. Incoming events are buffered and delivered
* upon `GetNext` request.
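*
* A sketch of the exchange, from the parent manager's point of view:
* {{{
* oldestChangedBuffer ! GetNext
* // first event from the buffer: InitialOldestState(oldest, safeToBeOldest)
* oldestChangedBuffer ! GetNext
* // subsequent events: OldestChanged(newOldest), one per GetNext
* }}}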
*/
class OldestChangedBuffer(role: Option[String]) extends Actor {
import OldestChangedBuffer._
val cluster = Cluster(context.system)
// sort by age, oldest first
val ageOrdering = Member.ageOrdering
var membersByAge: immutable.SortedSet[Member] = immutable.SortedSet.empty(ageOrdering)
var changes = Vector.empty[AnyRef]
// subscribe to MemberEvent, re-subscribe on restart
override def preStart(): Unit = {
cluster.subscribe(self, classOf[MemberEvent])
// There is a subtle difference between CoordinatedShutdown.PhaseClusterExiting and MemberExited.
// The MemberExited event is published immediately (the leader may have performed that transition on another node),
// and that will trigger the run of CoordinatedShutdown, while PhaseClusterExiting happens later.
// PhaseClusterExiting is used in the singleton because the graceful shutdown of the sharding region
// should preferably complete before stopping the singleton sharding coordinator on the same node.
val coordShutdown = CoordinatedShutdown(context.system)
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-1") { () =>
if (cluster.isTerminated || cluster.selfMember.status == MemberStatus.Down) {
Future.successful(Done)
} else {
implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
self.ask(SelfExiting).mapTo[Done]
}
}
}
override def postStop(): Unit = cluster.unsubscribe(self)
private val selfDc = ClusterSettings.DcRolePrefix + cluster.settings.SelfDataCenter
def matchingRole(member: Member): Boolean =
member.hasRole(selfDc) && role.forall(member.hasRole)
def trackChange(block: () => Unit): Unit = {
val before = membersByAge.headOption
block()
val after = membersByAge.headOption
if (before != after)
changes :+= OldestChanged(after.map(_.uniqueAddress))
}
def handleInitial(state: CurrentClusterState): Unit = {
membersByAge = immutable.SortedSet
.empty(ageOrdering)
.union(state.members.filter(m => m.status == MemberStatus.Up && matchingRole(m)))
// If there is some removal in progress of an older node it's not safe to immediately become oldest;
// removal of younger nodes doesn't matter. Note that it can also be started via restart after
// ClusterSingletonManagerIsStuck.
val selfUpNumber = state.members
.collectFirst { case m if m.uniqueAddress == cluster.selfUniqueAddress => m.upNumber }
.getOrElse(Int.MaxValue)
val safeToBeOldest = !state.members.exists { m =>
m.upNumber <= selfUpNumber && matchingRole(m) && (m.status == MemberStatus.Down || m.status == MemberStatus.Exiting || m.status == MemberStatus.Leaving)
}
val initial = InitialOldestState(membersByAge.headOption.map(_.uniqueAddress), safeToBeOldest)
changes :+= initial
}
def add(m: Member): Unit = {
if (matchingRole(m))
trackChange { () =>
// replace, it's possible that the upNumber has changed
membersByAge = membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress)
membersByAge += m
}
}
def remove(m: Member): Unit = {
if (matchingRole(m))
trackChange { () =>
membersByAge = membersByAge.filterNot(_.uniqueAddress == m.uniqueAddress)
}
}
def sendFirstChange(): Unit = {
// don't send cluster change events if this node is shutting itself down, just wait for SelfExiting
if (!cluster.isTerminated) {
val event = changes.head
changes = changes.tail
context.parent ! event
}
}
def receive = {
case state: CurrentClusterState => handleInitial(state)
case MemberUp(m) => add(m)
case MemberRemoved(m, _) => remove(m)
case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress =>
remove(m)
case SelfExiting =>
remove(cluster.readView.self)
sender() ! Done // reply to ask
case GetNext if changes.isEmpty =>
context.become(deliverNext, discardOld = false)
case GetNext =>
sendFirstChange()
}
// the buffer was empty when GetNext was received, deliver next event immediately
def deliverNext: Actor.Receive = {
case state: CurrentClusterState =>
handleInitial(state)
sendFirstChange()
context.unbecome()
case MemberUp(m) =>
add(m)
deliverChanges()
case MemberRemoved(m, _) =>
remove(m)
deliverChanges()
case MemberExited(m) if m.uniqueAddress != cluster.selfUniqueAddress =>
remove(m)
deliverChanges()
case SelfExiting =>
remove(cluster.readView.self)
deliverChanges()
sender() ! Done // reply to ask
}
def deliverChanges(): Unit = {
if (changes.nonEmpty) {
sendFirstChange()
context.unbecome()
}
}
override def unhandled(msg: Any): Unit = {
msg match {
case _: MemberEvent => // ok, silence
case _ => super.unhandled(msg)
}
}
}
}
}
/**
* Thrown when a consistent state can't be determined within the
* defined retry limits. Eventually the cluster will reach a stable state and
* the manager can continue, which is simplified by starting over with a clean
* state. The parent supervisor should typically restart the actor, i.e. the
* default supervision decision.
*/
class ClusterSingletonManagerIsStuck(message: String) extends AkkaException(message, null)
/**
* Manages a singleton actor instance among all cluster nodes or a group
* of nodes tagged with a specific role. At most one singleton instance
* is running at any point in time.
*
* The ClusterSingletonManager is supposed to be started on all nodes,
* or all nodes with the specified role, in the cluster with `actorOf`.
* The actual singleton is started on the oldest node by creating a child
* actor from the supplied `singletonProps`.
*
* The singleton actor is always running on the oldest member with the specified role.
* The oldest member is determined by [[akka.cluster.Member#isOlderThan]].
* This can change when members are removed. A graceful hand-over can normally
* be performed when the current oldest node is leaving the cluster. Be aware that
* there is a short time period when there is no active singleton during the
* hand-over process.
*
* The cluster failure detector will notice when the oldest node
* becomes unreachable due to things like JVM crash, hard shut down,
* or network failure. When the crashed node has been removed (via down) from the
* cluster a new oldest node will take over and a new singleton actor is
* created. For these failure scenarios there will not be a graceful hand-over,
* but more than one active singleton is prevented by all reasonable means. Some
* corner cases are eventually resolved by configurable timeouts.
*
* You access the singleton actor with [[ClusterSingletonProxy]].
* Alternatively the singleton actor may broadcast its existence when it is started.
*
* Use factory method [[ClusterSingletonManager#props]] to create the
* [[akka.actor.Props]] for the actor.
*
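* For example (a minimal sketch; the singleton actor `Consumer` and the actor
* name are illustrative):
* {{{
* system.actorOf(
*   ClusterSingletonManager.props(
*     singletonProps = Props[Consumer],
*     terminationMessage = PoisonPill,
*     settings = ClusterSingletonManagerSettings(system)),
*   name = "consumer")
* }}}
*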
* Not intended for subclassing by user code.
*
*
* @param singletonProps [[akka.actor.Props]] of the singleton actor instance.
*
* @param terminationMessage When handing over to a new oldest node
* this `terminationMessage` is sent to the singleton actor to tell
* it to finish its work, close resources, and stop.
* The hand-over to the new oldest node is completed when the
* singleton actor is terminated.
* Note that [[akka.actor.PoisonPill]] is a perfectly fine
* `terminationMessage` if you only need to stop the actor.
*
* @param settings see [[ClusterSingletonManagerSettings]]
*/
@DoNotInherit
class ClusterSingletonManager(singletonProps: Props, terminationMessage: Any, settings: ClusterSingletonManagerSettings)
extends Actor
with FSM[ClusterSingletonManager.State, ClusterSingletonManager.Data] {
import ClusterSingletonManager.Internal._
import ClusterSingletonManager.Internal.OldestChangedBuffer._
import settings._
val cluster = Cluster(context.system)
val selfUniqueAddressOption = Some(cluster.selfUniqueAddress)
import cluster.settings.LogInfo
require(
role.forall(cluster.selfRoles.contains),
s"This cluster member [${cluster.selfAddress}] doesn't have the role [$role]")
private val singletonLeaseName = s"${context.system.name}-singleton-${self.path}"
val lease: Option[Lease] = settings.leaseSettings.map(
settings =>
LeaseProvider(context.system)
.getLease(singletonLeaseName, settings.leaseImplementation, cluster.selfAddress.hostPort))
val leaseRetryInterval: FiniteDuration = settings.leaseSettings match {
case Some(s) => s.leaseRetryInterval
case None => 5.seconds // won't be used
}
val removalMargin =
if (settings.removalMargin <= Duration.Zero) cluster.downingProvider.downRemovalMargin
else settings.removalMargin
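// The retry counts are derived from the removal margin. Worked example with illustrative
// numbers: removalMargin = 20s and handOverRetryInterval = 1s give n = 20, so
// maxHandOverRetries = max(minRetries, 23) and maxTakeOverRetries is 3 less (at least 1),
// i.e. hand-over keeps being retried slightly beyond the removal margin before giving up.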
val (maxHandOverRetries, maxTakeOverRetries) = {
val n = (removalMargin.toMillis / handOverRetryInterval.toMillis).toInt
val minRetries = context.system.settings.config.getInt("akka.cluster.singleton.min-number-of-hand-over-retries")
require(minRetries >= 1, "min-number-of-hand-over-retries must be >= 1")
val handOverRetries = math.max(minRetries, n + 3)
val takeOverRetries = math.max(1, handOverRetries - 3)
(handOverRetries, takeOverRetries)
}
// started when self member is Up
var oldestChangedBuffer: ActorRef = _
// the previous GetNext request has been delivered and a new GetNext can be sent
var oldestChangedReceived = true
var selfExited = false
// keep track of previously removed members
var removed = Map.empty[UniqueAddress, Deadline]
def addRemoved(node: UniqueAddress): Unit =
removed += node -> (Deadline.now + 15.minutes)
def cleanupOverdueNotMemberAnyMore(): Unit = {
removed = removed.filter { case (_, deadline) => deadline.hasTimeLeft }
}
// for CoordinatedShutdown
val coordShutdown = CoordinatedShutdown(context.system)
val memberExitingProgress = Promise[Done]()
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "wait-singleton-exiting") { () =>
if (cluster.isTerminated || cluster.selfMember.status == MemberStatus.Down)
Future.successful(Done)
else
memberExitingProgress.future
}
coordShutdown.addTask(CoordinatedShutdown.PhaseClusterExiting, "singleton-exiting-2") { () =>
if (cluster.isTerminated || cluster.selfMember.status == MemberStatus.Down) {
Future.successful(Done)
} else {
implicit val timeout = Timeout(coordShutdown.timeout(CoordinatedShutdown.PhaseClusterExiting))
self.ask(SelfExiting).mapTo[Done]
}
}
def logInfo(message: String): Unit =
if (LogInfo) log.info(message)
def logInfo(template: String, arg1: Any): Unit =
if (LogInfo) log.info(template, arg1)
def logInfo(template: String, arg1: Any, arg2: Any): Unit =
if (LogInfo) log.info(template, arg1, arg2)
def logInfo(template: String, arg1: Any, arg2: Any, arg3: Any): Unit =
if (LogInfo) log.info(template, arg1, arg2, arg3)
override def preStart(): Unit = {
super.preStart()
require(!cluster.isTerminated, "Cluster node must not be terminated")
// subscribe to cluster changes, re-subscribe on restart
cluster.subscribe(self, ClusterEvent.InitialStateAsEvents, classOf[MemberRemoved], classOf[MemberDowned])
setTimer(CleanupTimer, Cleanup, 1.minute, repeat = true)
// defer subscription to avoid some jitter when
// starting/joining several nodes at the same time
cluster.registerOnMemberUp(self ! StartOldestChangedBuffer)
}
override def postStop(): Unit = {
cancelTimer(CleanupTimer)
cluster.unsubscribe(self)
memberExitingProgress.trySuccess(Done)
super.postStop()
}
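// selection of this manager's counterpart on another node: same actor path as self,
// but with the given remote address, e.g. akka://system@host:port/user/<manager name>
// (the address scheme depends on the transport; the form shown is illustrative)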
def peer(at: Address): ActorSelection = context.actorSelection(self.path.toStringWithAddress(at))
def getNextOldestChanged(): Unit =
if (oldestChangedReceived) {
oldestChangedReceived = false
oldestChangedBuffer ! GetNext
}
startWith(Start, Uninitialized)
when(Start) {
case Event(StartOldestChangedBuffer, _) =>
oldestChangedBuffer =
context.actorOf(Props(classOf[OldestChangedBuffer], role).withDispatcher(context.props.dispatcher))
getNextOldestChanged()
stay
case Event(InitialOldestState(oldestOption, safeToBeOldest), _) =>
oldestChangedReceived = true
if (oldestOption == selfUniqueAddressOption && safeToBeOldest)
// oldest immediately
tryGotoOldest()
else if (oldestOption == selfUniqueAddressOption)
goto(BecomingOldest).using(BecomingOldestData(None))
else
goto(Younger).using(YoungerData(oldestOption))
}
when(Younger) {
case Event(OldestChanged(oldestOption), YoungerData(previousOldestOption)) =>
oldestChangedReceived = true
if (oldestOption == selfUniqueAddressOption) {
logInfo("Younger observed OldestChanged: [{} -> myself]", previousOldestOption.map(_.address))
previousOldestOption match {
case None => tryGotoOldest()
case Some(prev) if removed.contains(prev) => tryGotoOldest()
case Some(prev) =>
peer(prev.address) ! HandOverToMe
goto(BecomingOldest).using(BecomingOldestData(previousOldestOption))
}
} else {
logInfo(
"Younger observed OldestChanged: [{} -> {}]",
previousOldestOption.map(_.address),
oldestOption.map(_.address))
getNextOldestChanged()
stay.using(YoungerData(oldestOption))
}
case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self downed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), _) =>
scheduleDelayedMemberRemoved(m)
stay
case Event(DelayedMemberRemoved(m), YoungerData(Some(previousOldest))) if m.uniqueAddress == previousOldest =>
logInfo("Previous oldest removed [{}]", m.address)
addRemoved(m.uniqueAddress)
// transition when OldestChanged
stay.using(YoungerData(None))
case Event(HandOverToMe, _) =>
// this node was probably quickly restarted with the same hostname:port,
// confirm that the old singleton instance has been stopped
sender() ! HandOverDone
stay
}
when(BecomingOldest) {
case Event(HandOverInProgress, _) =>
// confirmation that the hand-over process has started
logInfo("Hand-over in progress at [{}]", sender().path.address)
cancelTimer(HandOverRetryTimer)
stay
case Event(HandOverDone, BecomingOldestData(Some(previousOldest))) =>
if (sender().path.address == previousOldest.address)
tryGotoOldest()
else {
logInfo(
"Ignoring HandOverDone in BecomingOldest from [{}]. Expected previous oldest [{}]",
sender().path.address,
previousOldest.address)
stay
}
case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self downed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), _) =>
scheduleDelayedMemberRemoved(m)
stay
case Event(DelayedMemberRemoved(m), BecomingOldestData(Some(previousOldest)))
if m.uniqueAddress == previousOldest =>
logInfo("Previous oldest [{}] removed", previousOldest.address)
addRemoved(m.uniqueAddress)
tryGotoOldest()
case Event(TakeOverFromMe, BecomingOldestData(previousOldestOption)) =>
val senderAddress = sender().path.address
// it would have been better to include the UniqueAddress in the TakeOverFromMe message,
// but can't change due to backwards compatibility
cluster.state.members.collectFirst { case m if m.address == senderAddress => m.uniqueAddress } match {
case None =>
// from unknown node, ignore
logInfo("Ignoring TakeOver request from unknown node in BecomingOldest from [{}].", senderAddress)
stay
case Some(senderUniqueAddress) =>
previousOldestOption match {
case Some(previousOldest) =>
if (previousOldest == senderUniqueAddress) sender() ! HandOverToMe
else
logInfo(
"Ignoring TakeOver request in BecomingOldest from [{}]. Expected previous oldest [{}]",
sender().path.address,
previousOldest.address)
stay
case None =>
sender() ! HandOverToMe
stay.using(BecomingOldestData(Some(senderUniqueAddress)))
}
}
case Event(HandOverRetry(count), BecomingOldestData(previousOldestOption)) =>
if (count <= maxHandOverRetries) {
logInfo("Retry [{}], sending HandOverToMe to [{}]", count, previousOldestOption.map(_.address))
previousOldestOption.foreach(node => peer(node.address) ! HandOverToMe)
setTimer(HandOverRetryTimer, HandOverRetry(count + 1), handOverRetryInterval, repeat = false)
stay()
} else if (previousOldestOption.forall(removed.contains)) {
// can't send HandOverToMe, previousOldest unknown for new node (or restart)
// previous oldest might be down or removed, so no TakeOverFromMe message is received
logInfo("Timeout in BecomingOldest. Previous oldest unknown, removed and no TakeOver request.")
tryGotoOldest()
} else if (cluster.isTerminated)
stop()
else
throw new ClusterSingletonManagerIsStuck(
s"Becoming singleton oldest was stuck because previous oldest [$previousOldestOption] is unresponsive")
}
def scheduleDelayedMemberRemoved(m: Member): Unit = {
if (removalMargin > Duration.Zero) {
log.debug("Schedule DelayedMemberRemoved for [{}]", m.address)
context.system.scheduler.scheduleOnce(removalMargin, self, DelayedMemberRemoved(m))(context.dispatcher)
} else
self ! DelayedMemberRemoved(m)
}
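// Acquire the lease asynchronously; the outcome is piped back to self as
// AcquireLeaseResult or AcquireLeaseFailure, and a lease lost later on is
// signalled via the acquire callback as LeaseLost.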
def tryAcquireLease(): State = {
import context.dispatcher
pipe(lease.get.acquire(reason => self ! LeaseLost(reason)).map[Any](AcquireLeaseResult).recover {
case NonFatal(t) => AcquireLeaseFailure(t)
}).to(self)
goto(AcquiringLease).using(AcquiringLeaseData(leaseRequestInProgress = true, None))
}
// Try to transition to Oldest, acquiring the lease first if one is configured
def tryGotoOldest(): State = {
// if a lease is configured it must be acquired before starting the singleton
lease match {
case None =>
gotoOldest()
case Some(_) =>
logInfo("Trying to acquire lease before starting singleton")
tryAcquireLease()
}
}
when(AcquiringLease) {
case Event(AcquireLeaseResult(result), _) =>
logInfo("Acquire lease result {}", result)
if (result) {
gotoOldest()
} else {
setTimer(LeaseRetryTimer, LeaseRetry, leaseRetryInterval)
stay.using(AcquiringLeaseData(leaseRequestInProgress = false, None))
}
case Event(Terminated(ref), AcquiringLeaseData(_, Some(singleton))) if ref == singleton =>
logInfo("Singleton actor terminated. Trying to acquire lease again before re-creating.")
// tryAcquireLease resets the singleton reference in the state data to None
tryAcquireLease()
case Event(AcquireLeaseFailure(t), _) =>
log.error(t, "failed to get lease (will be retried)")
setTimer(LeaseRetryTimer, LeaseRetry, leaseRetryInterval)
stay.using(AcquiringLeaseData(leaseRequestInProgress = false, None))
case Event(LeaseRetry, _) =>
// If the lease was lost (i.e. the previous state was Oldest) we don't retry acquiring
// the lease until the old singleton instance has terminated, so at this point we know
// there is no running instance
tryAcquireLease()
case Event(OldestChanged(oldestOption), AcquiringLeaseData(_, singleton)) =>
handleOldestChanged(singleton, oldestOption)
case Event(HandOverToMe, AcquiringLeaseData(_, singleton)) =>
gotoHandingOver(singleton, Some(sender()))
case Event(TakeOverFromMe, _) =>
// already oldest, so confirm and continue like that
sender() ! HandOverToMe
stay
case Event(SelfExiting, _) =>
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
case Event(MemberDowned(m), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self downed, stopping ClusterSingletonManager")
stop()
}
def gotoOldest(): State = {
val singleton = context.watch(context.actorOf(singletonProps, singletonName))
logInfo("Singleton manager starting singleton actor [{}]", singleton.path)
goto(Oldest).using(OldestData(Some(singleton)))
}
def handleOldestChanged(singleton: Option[ActorRef], oldestOption: Option[UniqueAddress]): State = {
oldestChangedReceived = true
logInfo("{} observed OldestChanged: [{} -> {}]", stateName, cluster.selfAddress, oldestOption.map(_.address))
oldestOption match {
case Some(a) if a == cluster.selfUniqueAddress =>
// already oldest
stay
case Some(a) if !selfExited && removed.contains(a) =>
// The member removal was not completed and the old removed node is considered
// oldest again. Safest is to terminate the singleton instance and goto Younger.
// This node will become oldest again when the other is removed again.
gotoHandingOver(singleton, None)
case Some(a) =>
// send TakeOver request in case the new oldest doesn't know the previous oldest
peer(a.address) ! TakeOverFromMe
setTimer(TakeOverRetryTimer, TakeOverRetry(1), handOverRetryInterval, repeat = false)
goto(WasOldest).using(WasOldestData(singleton, newOldestOption = Some(a)))
case None =>
// new oldest will initiate the hand-over
setTimer(TakeOverRetryTimer, TakeOverRetry(1), handOverRetryInterval, repeat = false)
goto(WasOldest).using(WasOldestData(singleton, newOldestOption = None))
}
}
when(Oldest) {
case Event(OldestChanged(oldestOption), OldestData(singleton)) =>
handleOldestChanged(singleton, oldestOption)
case Event(HandOverToMe, OldestData(singleton)) =>
gotoHandingOver(singleton, Some(sender()))
case Event(TakeOverFromMe, _) =>
// already oldest, so confirm and continue like that
sender() ! HandOverToMe
stay
case Event(Terminated(ref), d @ OldestData(Some(singleton))) if ref == singleton =>
logInfo("Singleton actor [{}] was terminated", singleton.path)
stay.using(d.copy(singleton = None))
case Event(SelfExiting, _) =>
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
case Event(MemberDowned(m), OldestData(singleton)) if m.uniqueAddress == cluster.selfUniqueAddress =>
singleton match {
case Some(s) =>
logInfo("Self downed, stopping")
gotoStopping(s)
case None =>
logInfo("Self downed, stopping ClusterSingletonManager")
stop()
}
case Event(LeaseLost(reason), OldestData(singleton)) =>
log.warning("Lease has been lost. Reason: {}. Terminating singleton and trying to re-acquire lease", reason)
singleton match {
case Some(s) =>
s ! terminationMessage
goto(AcquiringLease).using(AcquiringLeaseData(leaseRequestInProgress = false, singleton))
case None =>
tryAcquireLease()
}
}
when(WasOldest) {
case Event(TakeOverRetry(count), WasOldestData(singleton, newOldestOption)) =>
if ((cluster.isTerminated || selfExited) && (newOldestOption.isEmpty || count > maxTakeOverRetries)) {
singleton match {
case Some(s) => gotoStopping(s)
case None => stop()
}
} else if (count <= maxTakeOverRetries) {
if (maxTakeOverRetries - count <= 3)
logInfo("Retry [{}], sending TakeOverFromMe to [{}]", count, newOldestOption.map(_.address))
else
log.debug("Retry [{}], sending TakeOverFromMe to [{}]", count, newOldestOption.map(_.address))
newOldestOption.foreach(node => peer(node.address) ! TakeOverFromMe)
setTimer(TakeOverRetryTimer, TakeOverRetry(count + 1), handOverRetryInterval, repeat = false)
stay
} else
throw new ClusterSingletonManagerIsStuck(s"Expected hand-over to [$newOldestOption] never occurred")
case Event(HandOverToMe, WasOldestData(singleton, _)) =>
gotoHandingOver(singleton, Some(sender()))
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress && !selfExited =>
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), WasOldestData(singleton, Some(newOldest)))
if !selfExited && m.uniqueAddress == newOldest =>
addRemoved(m.uniqueAddress)
gotoHandingOver(singleton, None)
case Event(Terminated(ref), d @ WasOldestData(singleton, _)) if singleton.contains(ref) =>
logInfo("Singleton actor [{}] was terminated", ref.path)
stay.using(d.copy(singleton = None))
case Event(SelfExiting, _) =>
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
case Event(MemberDowned(m), WasOldestData(singleton, _)) if m.uniqueAddress == cluster.selfUniqueAddress =>
singleton match {
case None =>
logInfo("Self downed, stopping ClusterSingletonManager")
stop()
case Some(s) =>
logInfo("Self downed, stopping")
gotoStopping(s)
}
}
def gotoHandingOver(singleton: Option[ActorRef], handOverTo: Option[ActorRef]): State = {
singleton match {
case None =>
handOverDone(handOverTo)
case Some(s) =>
handOverTo.foreach { _ ! HandOverInProgress }
logInfo("Singleton manager stopping singleton actor [{}]", s.path)
s ! terminationMessage
goto(HandingOver).using(HandingOverData(s, handOverTo))
}
}
when(HandingOver) {
case Event(Terminated(ref), HandingOverData(singleton, handOverTo)) if ref == singleton =>
handOverDone(handOverTo)
case Event(HandOverToMe, HandingOverData(_, handOverTo)) if handOverTo.contains(sender()) =>
// retry
sender() ! HandOverInProgress
stay
case Event(SelfExiting, _) =>
selfMemberExited()
// complete memberExitingProgress when handOverDone
sender() ! Done // reply to ask
stay
}
def handOverDone(handOverTo: Option[ActorRef]): State = {
val newOldest = handOverTo.map(_.path.address)
logInfo("Singleton terminated, hand-over done [{} -> {}]", cluster.selfAddress, newOldest)
handOverTo.foreach { _ ! HandOverDone }
memberExitingProgress.trySuccess(Done)
if (removed.contains(cluster.selfUniqueAddress)) {
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
} else if (handOverTo.isEmpty)
goto(Younger).using(YoungerData(None))
else
goto(End).using(EndData)
}
def gotoStopping(singleton: ActorRef): State = {
logInfo("Singleton manager stopping singleton actor [{}]", singleton.path)
singleton ! terminationMessage
goto(Stopping).using(StoppingData(singleton))
}
when(Stopping) {
case Event(Terminated(ref), StoppingData(singleton)) if ref == singleton =>
logInfo("Singleton actor [{}] was terminated", singleton.path)
stop()
}
when(End) {
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress =>
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
case Event(_: OldestChanged | HandOverToMe, _) =>
// not interested anymore - waiting for removal
stay()
}
def selfMemberExited(): Unit = {
selfExited = true
logInfo("Exited [{}]", cluster.selfAddress)
}
whenUnhandled {
case Event(SelfExiting, _) =>
selfMemberExited()
memberExitingProgress.trySuccess(Done)
sender() ! Done // reply to ask
stay
case Event(MemberRemoved(m, _), _) if m.uniqueAddress == cluster.selfUniqueAddress && !selfExited =>
logInfo("Self removed, stopping ClusterSingletonManager")
stop()
case Event(MemberRemoved(m, _), _) =>
if (!selfExited) logInfo("Member removed [{}]", m.address)
addRemoved(m.uniqueAddress)
stay
case Event(DelayedMemberRemoved(m), _) =>
if (!selfExited) logInfo("Member removed [{}]", m.address)
addRemoved(m.uniqueAddress)
stay
case Event(TakeOverFromMe, _) =>
log.debug("Ignoring TakeOver request in [{}] from [{}].", stateName, sender().path.address)
stay
case Event(Cleanup, _) =>
cleanupOverdueNotMemberAnyMore()
stay
case Event(MemberDowned(m), _) =>
if (m.uniqueAddress == cluster.selfUniqueAddress)
logInfo("Self downed, waiting for removal")
stay
case Event(ReleaseLeaseFailure(t), _) =>
log.error(
t,
"Failed to release lease. Singleton may not be able to run on another node until lease timeout occurs")
stay
case Event(ReleaseLeaseResult(released), _) =>
if (released) {
logInfo("Lease released")
} else {
// TODO we could retry
log.error(
"Failed to release lease. Singleton may not be able to run on another node until lease timeout occurs")
}
stay
}
onTransition {
case from -> to => logInfo("ClusterSingletonManager state change [{} -> {}]", from, to)
}
onTransition {
case _ -> BecomingOldest => setTimer(HandOverRetryTimer, HandOverRetry(1), handOverRetryInterval, repeat = false)
}
onTransition {
case BecomingOldest -> _ => cancelTimer(HandOverRetryTimer)
case WasOldest -> _ => cancelTimer(TakeOverRetryTimer)
}
onTransition {
case (AcquiringLease, to) if to != Oldest =>
stateData match {
case AcquiringLeaseData(true, _) =>
logInfo("Releasing lease as leaving AcquiringLease going to [{}]", to)
import context.dispatcher
lease.foreach(l =>
pipe(l.release().map[Any](ReleaseLeaseResult).recover {
case t => ReleaseLeaseFailure(t)
}).to(self))
case _ =>
}
}
onTransition {
case Oldest -> _ =>
lease.foreach { l =>
logInfo("Releasing lease as leaving Oldest")
import context.dispatcher
pipe(l.release().map(ReleaseLeaseResult)).to(self)
}
}
onTransition {
case _ -> (Younger | Oldest) => getNextOldestChanged()
}
onTransition {
case _ -> (Younger | End) if removed.contains(cluster.selfUniqueAddress) =>
logInfo("Self removed, stopping ClusterSingletonManager")
// note that FSM.stop() can't be used in onTransition
context.stop(self)
}
}