* fix NPE in shutdownTransport
  * perhaps because shutdown was invoked before the transport was started
* system.dispatcher is already used elsewhere in the shutdown path
* improve logging of compression advertisement progress
* adjust RestartFlow.withBackoff parameters (see the first sketch below this list)
* quarantine after ActorSystemTerminating signal
  (which also cleans up compressions)
* Quarantine idle associations
* liveness checks by sending an extra HandshakeReq and updating
  lastUsed when the reply is received (see the second sketch below this list)
* conservative default value so the association survives a network partition,
  in case no other messages are sent
* Adjust logging and QuarantinedEvent for harmless quarantines
* Harmless if triggered via the shutdown signal or by cluster leaving
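
For reference, `RestartFlow.withBackoff` takes a minimum backoff, a maximum backoff and a jitter factor, and recreates the wrapped flow after each failure. A minimal sketch with placeholder values and a hypothetical `outboundFlow` stage (the concrete parameters chosen in this change are not shown here):

import scala.concurrent.duration._
import akka.NotUsed
import akka.stream.scaladsl.{ Flow, RestartFlow }

object RestartFlowExample {
  // Hypothetical stand-in for the stream stage being protected.
  def outboundFlow: Flow[String, String, NotUsed] = Flow[String]

  // The wrapped flow is recreated after each failure; restarts are delayed
  // exponentially between minBackoff and maxBackoff, and randomFactor adds
  // jitter so that many restarting streams do not restart at the same time.
  val restarting: Flow[String, String, NotUsed] =
    RestartFlow.withBackoff(minBackoff = 1.second, maxBackoff = 30.seconds, randomFactor = 0.2) { () ⇒
      outboundFlow
    }
}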
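
The liveness check itself is internal to Artery; the following is only a hypothetical sketch of the idea (`IdleLivenessChecker`, `sendKeepAlive`, `Tick` and `KeepAliveReply` are made-up names, not the real Artery API). An extra handshake is sent periodically, and a lastUsed timestamp is refreshed when the reply arrives, so a healthy but quiet association is not quarantined as idle:

import scala.concurrent.duration._
import akka.actor.{ Actor, Cancellable }

object IdleLivenessChecker {
  case object Tick
  case object KeepAliveReply // stands in for the reply of the real handshake protocol
}

// Hypothetical actor illustrating the liveness-check idea: periodically send a
// keep-alive (the extra HandshakeReq) and refresh lastUsed when the reply arrives.
class IdleLivenessChecker(sendKeepAlive: () ⇒ Unit) extends Actor {
  import IdleLivenessChecker._
  import context.dispatcher

  private var lastUsed: Long = System.nanoTime()
  private val tick: Cancellable =
    context.system.scheduler.schedule(1.second, 1.second, self, Tick)

  override def postStop(): Unit = tick.cancel()

  // True if a keep-alive reply was seen within `timeout`; the owner of this
  // checker would quarantine the association when this becomes false.
  def isRecentlyUsed(timeout: FiniteDuration): Boolean =
    (System.nanoTime() - lastUsed).nanos <= timeout

  def receive: Receive = {
    case Tick           ⇒ sendKeepAlive() // corresponds to the extra HandshakeReq
    case KeepAliveReply ⇒ lastUsed = System.nanoTime()
  }
}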
/**
 * Copyright (C) 2009-2018 Lightbend Inc. <https://www.lightbend.com>
 */

package akka.cluster

import scala.concurrent.duration.FiniteDuration

import akka.actor._
import akka.cluster.ClusterEvent.CurrentClusterState
import akka.cluster.ClusterEvent.MemberEvent
import akka.cluster.ClusterEvent.MemberJoined
import akka.cluster.ClusterEvent.MemberUp
import akka.cluster.ClusterEvent.MemberRemoved
import akka.cluster.ClusterEvent.MemberWeaklyUp
import akka.remote.FailureDetectorRegistry
import akka.remote.RemoteWatcher
import akka.remote.RARP

/**
 * INTERNAL API
 */
private[cluster] object ClusterRemoteWatcher {
  /**
   * Factory method for `ClusterRemoteWatcher` [[akka.actor.Props]].
   */
  def props(
    failureDetector: FailureDetectorRegistry[Address],
    heartbeatInterval: FiniteDuration,
    unreachableReaperInterval: FiniteDuration,
    heartbeatExpectedResponseAfter: FiniteDuration): Props =
    Props(classOf[ClusterRemoteWatcher], failureDetector, heartbeatInterval, unreachableReaperInterval,
      heartbeatExpectedResponseAfter).withDeploy(Deploy.local)
}

/**
 * INTERNAL API
 *
 * Specialization of [[akka.remote.RemoteWatcher]] that keeps
 * track of cluster member nodes and is responsible for watchees on cluster nodes.
 * [[akka.actor.AddressTerminated]] is published when a node is removed from the cluster.
 *
 * `RemoteWatcher` handles non-cluster nodes. `ClusterRemoteWatcher` takes
 * over responsibility from `RemoteWatcher` if a watch is added before a node is a
 * member of the cluster and the node later becomes a cluster member.
 */
private[cluster] class ClusterRemoteWatcher(
  failureDetector: FailureDetectorRegistry[Address],
  heartbeatInterval: FiniteDuration,
  unreachableReaperInterval: FiniteDuration,
  heartbeatExpectedResponseAfter: FiniteDuration)
  extends RemoteWatcher(
    failureDetector,
    heartbeatInterval,
    unreachableReaperInterval,
    heartbeatExpectedResponseAfter) {

  private val arteryEnabled = RARP(context.system).provider.remoteSettings.Artery.Enabled
  val cluster = Cluster(context.system)
  import cluster.selfAddress

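  // Scheduled as a self-message for gracefully removed members; the quarantine is
  // performed when it fires, unless a new incarnation of the node triggered it earlier.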
  private final case class DelayedQuarantine(m: Member, previousStatus: MemberStatus) extends NoSerializationVerificationNeeded

  private var pendingDelayedQuarantine: Set[UniqueAddress] = Set.empty

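  // Addresses of the current cluster members, excluding this node itself.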
  var clusterNodes: Set[Address] = Set.empty

  override def preStart(): Unit = {
    super.preStart()
    cluster.subscribe(self, classOf[MemberEvent])
  }

  override def postStop(): Unit = {
    super.postStop()
    cluster.unsubscribe(self)
  }

  override def receive = receiveClusterEvent orElse super.receive

  def receiveClusterEvent: Actor.Receive = {
    case state: CurrentClusterState ⇒
      clusterNodes = state.members.collect { case m if m.address != selfAddress ⇒ m.address }
      clusterNodes foreach takeOverResponsibility
      unreachable = unreachable diff clusterNodes
    case MemberJoined(m)                      ⇒ memberJoined(m)
    case MemberUp(m)                          ⇒ memberUp(m)
    case MemberWeaklyUp(m)                    ⇒ memberUp(m)
    case MemberRemoved(m, previousStatus)     ⇒ memberRemoved(m, previousStatus)
    case _: MemberEvent                       ⇒ // not interesting
    case DelayedQuarantine(m, previousStatus) ⇒ delayedQuarantine(m, previousStatus)
  }

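  // A joining member can be a new incarnation of a previously removed node with the
  // same host:port; in that case the old incarnation is quarantined right away.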
  private def memberJoined(m: Member): Unit = {
    if (m.address != selfAddress)
      quarantineOldIncarnation(m)
  }

  def memberUp(m: Member): Unit =
    if (m.address != selfAddress) {
      quarantineOldIncarnation(m)
      clusterNodes += m.address
      takeOverResponsibility(m.address)
      unreachable -= m.address
    }

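  // Members removed after being Down are quarantined immediately; gracefully removed
  // members are quarantined after a delay (Artery only), via DelayedQuarantine.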
  def memberRemoved(m: Member, previousStatus: MemberStatus): Unit =
    if (m.address != selfAddress) {
      clusterNodes -= m.address

      if (previousStatus == MemberStatus.Down) {
        quarantine(m.address, Some(m.uniqueAddress.longUid),
          s"Cluster member removed, previous status [$previousStatus]", harmless = false)
      } else if (arteryEnabled) {
        // Don't quarantine gracefully removed members (leaving) directly,
        // give Cluster Singleton some time to exchange TakeOver/HandOver messages.
        // If a new incarnation of the same host:port is seen then the quarantine of
        // the previous incarnation is triggered earlier.
        pendingDelayedQuarantine += m.uniqueAddress
        import context.dispatcher
        context.system.scheduler.scheduleOnce(cluster.settings.QuarantineRemovedNodeAfter, self, DelayedQuarantine(m, previousStatus))
      }

      publishAddressTerminated(m.address)
    }

  def quarantineOldIncarnation(newIncarnation: Member): Unit = {
    // If a new incarnation of the same host:port is seen then quarantine the previous incarnation
    if (pendingDelayedQuarantine.nonEmpty)
      pendingDelayedQuarantine.find(_.address == newIncarnation.address).foreach { oldIncarnation ⇒
        pendingDelayedQuarantine -= oldIncarnation
        quarantine(oldIncarnation.address, Some(oldIncarnation.longUid),
          s"Cluster member removed, new incarnation joined", harmless = true)
      }
  }

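  // Handles the DelayedQuarantine self-message: quarantine the removed member unless
  // a new incarnation has already caused it to be quarantined.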
  def delayedQuarantine(m: Member, previousStatus: MemberStatus): Unit = {
    if (pendingDelayedQuarantine(m.uniqueAddress)) {
      pendingDelayedQuarantine -= m.uniqueAddress
      quarantine(m.address, Some(m.uniqueAddress.longUid), s"Cluster member removed, previous status [$previousStatus]",
        harmless = true)
    }
  }

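  // Cluster members are monitored by the cluster failure detector, so heartbeat-based
  // watching is only started for nodes that are not cluster members.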
  override def watchNode(watchee: InternalActorRef): Unit =
    if (!clusterNodes(watchee.path.address)) super.watchNode(watchee)

  /**
   * When a cluster node is added this class takes over the
   * responsibility for watchees on that node already handled
   * by super RemoteWatcher.
   */
  def takeOverResponsibility(address: Address): Unit =
    if (watchingNodes(address)) {
      log.debug("Cluster is taking over responsibility of node: [{}]", address)
      unwatchNode(address)
    }

}