* moved to cluster tests, in new package akka.cluster.testkit
* changed config in tests
* migration guide
* documentation clarifications for Downing and Leaving
* update warnings in Singleton and Sharding
parent 064f06f5a6
commit a217d5566e
61 changed files with 414 additions and 309 deletions
@@ -63,7 +63,8 @@ abstract class ClusterShardingFailureSpecConfig(val mode: String) extends MultiN
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.classic.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.roles = ["backend"]
 akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
 akka.persistence.journal.leveldb-shared {
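The same substitution repeats through the test configurations below: the removed `akka.cluster.auto-down-unreachable-after` setting is replaced by the relocated test-only provider. A minimal sketch of the new form, using only settings that appear in these hunks:

```
import com.typesafe.config.ConfigFactory

// Test-only downing: the provider now lives in akka.cluster.testkit and reads
// its timeout from akka.cluster.testkit.auto-down-unreachable-after.
val testDowningConfig = ConfigFactory.parseString("""
  akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
  akka.cluster.testkit.auto-down-unreachable-after = 0s
  """)
```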
@@ -50,7 +50,8 @@ object ClusterShardingGetStateSpecConfig extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.sharding {
 coordinator-failure-backoff = 3s
 shard-failure-backoff = 3s

@@ -56,7 +56,8 @@ object ClusterShardingGetStatsSpecConfig extends MultiNodeConfig {
 akka.actor.provider = "cluster"
 akka.remote.classic.log-remote-lifecycle-events = off
 akka.log-dead-letters-during-shutdown = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.sharding {
 state-store-mode = "ddata"
 updating-state-timeout = 2s

@@ -67,7 +67,8 @@ abstract class ClusterShardingLeavingSpecConfig(val mode: String) extends MultiN
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.classic.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
 akka.persistence.journal.leveldb-shared {
 timeout = 5s

@@ -55,7 +55,8 @@ object ClusterShardingQueriesSpecConfig extends MultiNodeConfig {
 akka.actor.provider = "cluster"
 akka.remote.classic.log-remote-lifecycle-events = off
 akka.log-dead-letters-during-shutdown = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.sharding {
 state-store-mode = "ddata"
 shard-region-query-timeout = 0ms

@@ -61,7 +61,8 @@ abstract class ClusterShardingRememberEntitiesNewExtractorSpecConfig(val mode: S
 ConfigFactory
 .parseString(s"""
 akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.remote.classic.log-remote-lifecycle-events = off
 akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
 akka.persistence.journal.leveldb-shared {

@@ -56,7 +56,8 @@ object ClusterShardingRememberEntitiesPerfSpecConfig extends MultiNodeConfig {
 commonConfig(ConfigFactory.parseString(s"""
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.remote.log-remote-lifecycle-events = off
 akka.testconductor.barrier-timeout = 3 minutes
 akka.remote.artery.advanced.outbound-message-queue-size = 10000

@@ -69,7 +69,8 @@ abstract class ClusterShardingRememberEntitiesSpecConfig(val mode: String, val r
 modeConfig
 .withFallback(ConfigFactory.parseString(s"""
 akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.remote.log-remote-lifecycle-events = off
 akka.cluster.sharding.state-store-mode = "$mode"
 akka.cluster.sharding.distributed-data.durable.lmdb {

@@ -135,7 +135,8 @@ abstract class ClusterShardingSpecConfig(val mode: String, val entityRecoveryStr
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.roles = ["backend"]
 akka.cluster.distributed-data.gossip-interval = 1s
 akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"

@@ -61,7 +61,8 @@ object MultiDcClusterShardingSpecConfig extends MultiNodeConfig {
 akka.cluster {
 debug.verbose-heartbeat-logging = on
 debug.verbose-gossip-logging = on
-auto-down-unreachable-after = 0s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 0s
 sharding {
 retry-interval = 200ms
 }

@@ -44,7 +44,8 @@ abstract class MultiNodeClusterShardingConfig(
 .withFallback(ConfigFactory.parseString(s"""
 akka.loglevel = $loglevel
 akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.remote.log-remote-lifecycle-events = off
 akka.cluster.sharding.state-store-mode = "$mode"
 akka.cluster.sharding.distributed-data.durable.lmdb {
@@ -36,7 +36,6 @@ import akka.pattern.ask
 import akka.pattern.pipe
 import akka.util.JavaDurationConverters._
 import akka.util.Timeout
-import com.github.ghik.silencer.silent
 import com.typesafe.config.Config

 object ClusterSingletonManagerSettings {

@@ -45,7 +44,6 @@ object ClusterSingletonManagerSettings {
 * Create settings from the default configuration
 * `akka.cluster.singleton`.
 */
-@silent("deprecated") // DownRemovalMargin
 def apply(system: ActorSystem): ClusterSingletonManagerSettings =
 apply(system.settings.config.getConfig("akka.cluster.singleton"))
 // note that this setting has some additional logic inside the ClusterSingletonManager
@@ -45,7 +45,8 @@ object ClusterClientSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.client.heartbeat-interval = 1s
 akka.cluster.client.acceptable-heartbeat-pause = 3s
 akka.cluster.client.refresh-contacts-interval = 1s

@@ -30,7 +30,8 @@ object DistributedPubSubMediatorSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 akka.cluster.pub-sub.max-delta-elements = 500
 """))

@@ -33,7 +33,8 @@ object DistributedPubSubRestartSpec extends MultiNodeConfig {
 akka.cluster.pub-sub.gossip-interval = 500ms
 akka.actor.provider = cluster
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
 """))

 testTransport(on = true)
@@ -35,7 +35,8 @@ object ClusterSingletonManagerChaosSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 """))

 case object EchoStarted

@@ -28,7 +28,8 @@ object ClusterSingletonManagerLeaseSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 test-lease {
 lease-class = akka.cluster.TestLeaseActorClient
 heartbeat-interval = 1s

@@ -33,7 +33,8 @@ object ClusterSingletonManagerLeave2Spec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
 """))

 case object EchoStarted

@@ -26,7 +26,8 @@ object ClusterSingletonManagerLeaveSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
 """))

 case object EchoStarted

@@ -40,7 +40,8 @@ object ClusterSingletonManagerSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 """))

 nodeConfig(first, second, third, fourth, fifth, sixth)(ConfigFactory.parseString("akka.cluster.roles =[worker]"))

@@ -27,7 +27,8 @@ object ClusterSingletonManagerStartupSpec extends MultiNodeConfig {
 akka.loglevel = INFO
 akka.actor.provider = "cluster"
 akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
 """))

 case object EchoStarted
@@ -44,7 +44,8 @@ class ClusterSingletonLeavingSpeedSpec
 """
 akka.loglevel = DEBUG
 akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s

 # With 10 systems and setting min-number-of-hand-over-retries to 5 and gossip-interval to 2s it's possible to
 # reproduce the ClusterSingletonManagerIsStuck and slow hand over in issue #25639

@@ -31,7 +31,8 @@ class ClusterSingletonRestart2Spec
 akka.loglevel = INFO
 akka.cluster.roles = [singleton]
 akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
 akka.cluster.singleton.min-number-of-hand-over-retries = 5
 akka.remote {
 classic.netty.tcp {

@@ -14,10 +14,12 @@ import akka.testkit.TestActors
 import akka.testkit.TestProbe
 import com.typesafe.config.ConfigFactory

-class ClusterSingletonRestartSpec extends AkkaSpec("""
+class ClusterSingletonRestartSpec
+    extends AkkaSpec("""
 akka.loglevel = INFO
 akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
 akka.remote {
 classic.netty.tcp {
 hostname = "127.0.0.1"

@@ -52,7 +52,6 @@ object ClusterReceptionistSpec {
 }

 akka.cluster {
-#auto-down-unreachable-after = 0s
 jmx.multi-mbeans-in-same-jvm = on
 failure-detector.acceptable-heartbeat-pause = 3s
 }
@ -0,0 +1,8 @@
|
|||
# #27788 Remove AutoDowning
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout")
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout$")
|
||||
ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.ClusterSettings.AutoDownUnreachableAfter")
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDownBase")
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$")
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown")
|
||||
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDowning")
|
||||
|
|
@ -35,33 +35,17 @@ akka {
|
|||
# attempts.
|
||||
shutdown-after-unsuccessful-join-seed-nodes = off
|
||||
|
||||
# Should the 'leader' in the cluster be allowed to automatically mark
|
||||
# unreachable nodes as DOWN after a configured time of unreachability?
|
||||
# Using auto-down implies that two separate clusters will automatically be
|
||||
# formed in case of network partition.
|
||||
#
|
||||
# Don't enable this in production, see 'Auto-downing (DO NOT USE)' section
|
||||
# of Akka Cluster documentation.
|
||||
#
|
||||
# Disable with "off" or specify a duration to enable auto-down.
|
||||
# If a downing-provider-class is configured this setting is ignored.
|
||||
auto-down-unreachable-after = off
|
||||
|
||||
# Time margin after which shards or singletons that belonged to a downed/removed
|
||||
# partition are created in surviving partition. The purpose of this margin is that
|
||||
# in case of a network partition the persistent actors in the non-surviving partitions
|
||||
# must be stopped before corresponding persistent actors are started somewhere else.
|
||||
# This is useful if you implement downing strategies that handle network partitions,
|
||||
# e.g. by keeping the larger side of the partition and shutting down the smaller side.
|
||||
# It will not add any extra safety for auto-down-unreachable-after, since that is not
|
||||
# handling network partitions.
|
||||
# Disable with "off" or specify a duration to enable.
|
||||
down-removal-margin = off
|
||||
|
||||
# Pluggable support for downing of nodes in the cluster.
|
||||
# If this setting is left empty behavior will depend on 'auto-down-unreachable' in the following ways:
|
||||
# * if it is 'off' the `NoDowning` provider is used and no automatic downing will be performed
|
||||
# * if it is set to a duration the `AutoDowning` provider is with the configured downing duration
|
||||
# If this setting is left empty the `NoDowning` provider is used and no automatic downing will be performed.
|
||||
#
|
||||
# If specified the value must be the fully qualified class name of a subclass of
|
||||
# `akka.cluster.DowningProvider` having a public one argument constructor accepting an `ActorSystem`
|
||||
|
|
|
|||
|
|
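With `downing-provider-class` left empty the `NoDowning` provider is used, so removing unreachable members becomes a manual operation. A hedged sketch of what that looks like through the public `Cluster` extension (the surrounding method name is a placeholder):

```
import akka.actor.{ ActorSystem, Address }
import akka.cluster.Cluster

// Manual downing: with NoDowning (the default) an operator or management
// tool must down unreachable members explicitly.
def downMember(system: ActorSystem, unreachable: Address): Unit =
  Cluster(system).down(unreachable)
```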
@ -125,8 +125,19 @@ class Cluster(val system: ExtendedActorSystem) extends Extension {
|
|||
}
|
||||
|
||||
// needs to be lazy to allow downing provider impls to access Cluster (if not we get deadlock)
|
||||
lazy val downingProvider: DowningProvider =
|
||||
lazy val downingProvider: DowningProvider = {
|
||||
checkAutoDownUsage()
|
||||
DowningProvider.load(settings.DowningProviderClassName, system)
|
||||
}
|
||||
|
||||
private def checkAutoDownUsage(): Unit = {
|
||||
if (settings.DowningProviderClassName == "akka.cluster.AutoDowning" ||
|
||||
(settings.config.hasPath("auto-down-unreachable-after") && settings.config.getString(
|
||||
"auto-down-unreachable-after") != "off"))
|
||||
logWarning(
|
||||
"auto-down has been removed in Akka 2.6.0. See " +
|
||||
"https://doc.akka.io/docs/akka/2.6/typed/cluster.html#downing for alternatives.")
|
||||
}
|
||||
|
||||
// ========================================================
|
||||
// ===================== WORK DAEMONS =====================
|
||||
|
|
|
|||
|
|
@ -406,12 +406,17 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
|
|||
override def preStart(): Unit = {
|
||||
subscribeQuarantinedEvent()
|
||||
|
||||
cluster.downingProvider.downingActorProps.foreach { props =>
|
||||
cluster.downingProvider.downingActorProps match {
|
||||
case Some(props) =>
|
||||
val propsWithDispatcher =
|
||||
if (props.dispatcher == Deploy.NoDispatcherGiven) props.withDispatcher(context.props.dispatcher)
|
||||
else props
|
||||
|
||||
context.actorOf(propsWithDispatcher, name = "downingProvider")
|
||||
case None =>
|
||||
logInfo(
|
||||
"No downing-provider-class configured, manual cluster downing required, see " +
|
||||
"https://doc.akka.io/docs/akka/current/typed/cluster.html#downing")
|
||||
}
|
||||
|
||||
if (seedNodes.isEmpty) {
|
||||
|
|
@ -420,7 +425,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
|
|||
else
|
||||
logInfo(
|
||||
"No seed-nodes configured, manual cluster join required, see " +
|
||||
"https://doc.akka.io/docs/akka/current/cluster-usage.html#joining-to-seed-nodes")
|
||||
"https://doc.akka.io/docs/akka/current/typed/cluster.html#joining")
|
||||
} else {
|
||||
self ! JoinSeedNodes(seedNodes)
|
||||
}
|
||||
|
|
|
|||
|
|
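The updated log message points at the joining documentation; for reference, a minimal sketch of programmatic joining, the alternative to configuring `akka.cluster.seed-nodes` (the address is a placeholder):

```
import akka.actor.{ ActorSystem, Address }
import akka.cluster.Cluster

val system = ActorSystem("ClusterSystem")

// Programmatic join instead of seed-nodes configuration; replace the
// placeholder address with a real contact point.
Cluster(system).join(Address("akka", "ClusterSystem", "127.0.0.1", 2551))
```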
@ -116,21 +116,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
|
|||
cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s")
|
||||
}
|
||||
|
||||
// specific to the [[akka.cluster.DefaultDowningProvider]]
|
||||
val AutoDownUnreachableAfter: Duration = {
|
||||
val key = "auto-down-unreachable-after"
|
||||
toRootLowerCase(cc.getString(key)) match {
|
||||
case "off" => Duration.Undefined
|
||||
case _ => cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated Specific to [[akka.cluster.AutoDown]] should not be used anywhere else, instead
|
||||
* ``Cluster.downingProvider.downRemovalMargin`` should be used as it allows the downing provider to decide removal
|
||||
* margins
|
||||
*/
|
||||
@deprecated("Use Cluster.downingProvider.downRemovalMargin", since = "2.4.5")
|
||||
val DownRemovalMargin: FiniteDuration = {
|
||||
val key = "down-removal-margin"
|
||||
toRootLowerCase(cc.getString(key)) match {
|
||||
|
|
@ -142,7 +127,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
|
|||
val DowningProviderClassName: String = {
|
||||
val name = cc.getString("downing-provider-class")
|
||||
if (name.nonEmpty) name
|
||||
else if (AutoDownUnreachableAfter.isFinite) classOf[AutoDowning].getName
|
||||
else classOf[NoDowning].getName
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,6 @@ package akka.cluster
|
|||
|
||||
import akka.ConfigurationException
|
||||
import akka.actor.{ ActorSystem, ExtendedActorSystem, Props }
|
||||
import com.github.ghik.silencer.silent
|
||||
|
||||
import scala.concurrent.duration.FiniteDuration
|
||||
|
||||
|
|
@ -35,6 +34,15 @@ private[cluster] object DowningProvider {
|
|||
/**
|
||||
* API for plugins that will handle downing of cluster nodes. Concrete plugins must subclass and
|
||||
* have a public one argument constructor accepting an [[akka.actor.ActorSystem]].
|
||||
*
|
||||
* A custom `DowningProvider` can be configured with `akka.cluster.downing-provider-class`
|
||||
*
|
||||
* When implementing a downing provider you should make sure that it will not split the cluster into
|
||||
* several separate clusters in case of network problems or system overload (long GC pauses). This
|
||||
* is much more difficult than it might be perceived at first, so carefully read the concerns and scenarios
|
||||
* described in
|
||||
* https://doc.akka.io/docs/akka/current/typed/cluster.html#downing and
|
||||
* https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html
|
||||
*/
|
||||
abstract class DowningProvider {
|
||||
|
||||
|
|
@ -61,11 +69,9 @@ abstract class DowningProvider {
|
|||
}
|
||||
|
||||
/**
|
||||
* Default downing provider used when no provider is configured and 'auto-down-unreachable-after'
|
||||
* is not enabled.
|
||||
* Default downing provider used when no provider is configured.
|
||||
*/
|
||||
final class NoDowning(system: ActorSystem) extends DowningProvider {
|
||||
@silent("deprecated")
|
||||
override def downRemovalMargin: FiniteDuration = Cluster(system).settings.DownRemovalMargin
|
||||
override val downingActorProps: Option[Props] = None
|
||||
}
|
||||
|
|
|
|||
|
|
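To make the documented contract concrete, a hypothetical provider satisfying the requirements above — a public one-argument constructor taking an `ActorSystem`, and no automatic downing actor (both overridden members are the ones defined by `DowningProvider` in this file):

```
import scala.concurrent.duration._

import akka.actor.{ ActorSystem, Props }
import akka.cluster.DowningProvider

// Hypothetical provider: downs nothing automatically (like NoDowning) but
// reserves a removal margin for shards and singletons.
final class ManualDowningProvider(system: ActorSystem) extends DowningProvider {
  override def downRemovalMargin: FiniteDuration = 10.seconds
  override def downingActorProps: Option[Props] = None
}
```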
@@ -22,7 +22,8 @@ object LeaderDowningAllOtherNodesMultiJvmSpec extends MultiNodeConfig {
 debugConfig(on = false)
 .withFallback(ConfigFactory.parseString("""
 akka.cluster.failure-detector.monitored-by-nr-of-members = 2
-akka.cluster.auto-down-unreachable-after = 1s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 1s
 """))
 .withFallback(MultiNodeClusterSpec.clusterConfig))
 }

@@ -21,7 +21,9 @@ final case class LeaderDowningNodeThatIsUnreachableMultiNodeConfig(failureDetect

 commonConfig(
 debugConfig(on = false)
-.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 2s"))
+.withFallback(ConfigFactory.parseString("""
+  akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+  akka.cluster.testkit.auto-down-unreachable-after = 2s"""))
 .withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
 }

@@ -21,7 +21,9 @@ object LeaderLeavingMultiJvmSpec extends MultiNodeConfig {

 commonConfig(
 debugConfig(on = false)
-.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 0s"))
+.withFallback(ConfigFactory.parseString("""
+  akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+  akka.cluster.testkit.auto-down-unreachable-after = 0s"""))
 .withFallback(MultiNodeClusterSpec.clusterConfigWithFailureDetectorPuppet))
 }

@@ -39,7 +39,8 @@ object MultiDcSplitBrainMultiJvmSpec extends MultiNodeConfig {
 akka.cluster {
 gossip-interval = 500ms
 leader-actions-interval = 1s
-auto-down-unreachable-after = 1s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 1s
 }
 """)
 .withFallback(MultiNodeClusterSpec.clusterConfig))

@@ -21,8 +21,10 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
 val third = role("third")

 commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 1s
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 1s
 akka.cluster.prune-gossip-tombstones-after = 1s
 akka.remote.classic.log-frame-size-exceeding = 1200b
 akka.remote.artery.advanced.aeron {

@@ -30,7 +32,8 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
 embedded-media-driver = off
 aeron-dir = "target/aeron-NodeChurnSpec"
 }
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))

 class LogListener(testActor: ActorRef) extends Actor {
 def receive = {
@@ -18,7 +18,7 @@ object NodeDowningAndBeingRemovedMultiJvmSpec extends MultiNodeConfig {
 commonConfig(
 debugConfig(on = false).withFallback(
 ConfigFactory
-.parseString("akka.cluster.auto-down-unreachable-after = off")
+.parseString("akka.cluster.testkit.auto-down-unreachable-after = off")
 .withFallback(MultiNodeClusterSpec.clusterConfig)))
 }

@@ -21,10 +21,12 @@ object QuickRestartMultiJvmSpec extends MultiNodeConfig {
 val third = role("third")

 commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.testkit.auto-down-unreachable-after = off
 akka.cluster.allow-weakly-up-members = off
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))

 }

@@ -28,11 +28,13 @@ object RestartFirstSeedNodeMultiJvmSpec extends MultiNodeConfig {
 val seed3 = role("seed3")

 commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.testkit.auto-down-unreachable-after = off
 akka.cluster.retry-unsuccessful-join-after = 3s
 akka.cluster.allow-weakly-up-members = off
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))
 }

 class RestartFirstSeedNodeMultiJvmNode1 extends RestartFirstSeedNodeSpec

@@ -28,7 +28,8 @@ object RestartNode2SpecMultiJvmSpec extends MultiNodeConfig {
 commonConfig(
 debugConfig(on = false)
 .withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
 akka.cluster.retry-unsuccessful-join-after = 3s
 akka.cluster.allow-weakly-up-members = off
 akka.remote.retry-gate-closed-for = 45s

@@ -29,7 +29,7 @@ object RestartNode3MultiJvmSpec extends MultiNodeConfig {
 commonConfig(
 debugConfig(on = false)
 .withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.testkit.auto-down-unreachable-after = off
 akka.cluster.allow-weakly-up-members = off
 # test is using Java serialization and not priority to rewrite
 akka.actor.allow-java-serialization = on

@@ -34,7 +34,8 @@ object RestartNodeMultiJvmSpec extends MultiNodeConfig {
 commonConfig(
 debugConfig(on = false)
 .withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 5s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 5s
 akka.cluster.allow-weakly-up-members = off
 #akka.remote.use-passive-connections = off
 # test is using Java serialization and not priority to rewrite

@@ -16,12 +16,16 @@ final case class SingletonClusterMultiNodeConfig(failureDetectorPuppet: Boolean)
 val first = role("first")
 val second = role("second")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+  debugConfig(on = false)
+    .withFallback(ConfigFactory.parseString("""
 akka.cluster {
-auto-down-unreachable-after = 0s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 0s
 failure-detector.threshold = 4
 }
-""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
+"""))
+    .withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))

 }

@@ -21,12 +21,16 @@ final case class SplitBrainMultiNodeConfig(failureDetectorPuppet: Boolean) exten
 val fourth = role("fourth")
 val fifth = role("fifth")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+  debugConfig(on = false)
+    .withFallback(ConfigFactory.parseString("""
 akka.remote.retry-gate-closed-for = 3 s
 akka.cluster {
-auto-down-unreachable-after = 1s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 1s
 failure-detector.threshold = 4
-}""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
+}"""))
+    .withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))

 testTransport(on = true)
 }

@@ -34,10 +34,14 @@ object StreamRefSpec extends MultiNodeConfig {
 val second = role("second")
 val third = role("third")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+  debugConfig(on = false)
+    .withFallback(ConfigFactory.parseString("""
 akka.cluster {
-auto-down-unreachable-after = 1s
-}""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 1s
+}"""))
+    .withFallback(MultiNodeClusterSpec.clusterConfig))

 testTransport(on = true)
@@ -119,7 +119,8 @@ private[cluster] object StressMultiJvmSpec extends MultiNodeConfig {
 akka.actor.provider = cluster
 akka.cluster {
 failure-detector.acceptable-heartbeat-pause = 10s
-auto-down-unreachable-after = 1s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 1s
 publish-stats-interval = 1s
 }
 akka.loggers = ["akka.testkit.TestEventListener"]

@@ -42,7 +42,6 @@ class ClusterConfigSpec extends AkkaSpec {
 LeaderActionsInterval should ===(1 second)
 UnreachableNodesReaperInterval should ===(1 second)
 PublishStatsInterval should ===(Duration.Undefined)
-AutoDownUnreachableAfter should ===(Duration.Undefined)
 DownRemovalMargin should ===(Duration.Zero)
 MinNrOfMembers should ===(1)
 MinNrOfMembersOfRole should ===(Map.empty[String, Int])
@@ -11,7 +11,8 @@ import com.typesafe.config.{ Config, ConfigFactory }
 object ClusterLogSpec {
 val config = """
 akka.cluster {
-auto-down-unreachable-after = 0s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 0s
 publish-stats-interval = 0 s # always, when it happens
 failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet
 }

@@ -30,7 +30,8 @@ import scala.concurrent.duration._
 object ClusterSpec {
 val config = """
 akka.cluster {
-auto-down-unreachable-after = 0s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 0s
 periodic-tasks-initial-delay = 120 seconds // turn off scheduled tasks
 publish-stats-interval = 0 s # always, when it happens
 failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet
@@ -6,14 +6,17 @@ package akka.cluster

 import java.util.concurrent.atomic.AtomicBoolean

+import scala.concurrent.duration._
+
 import akka.ConfigurationException
-import akka.actor.{ ActorSystem, Props }
-import akka.testkit.TestKit.{ awaitCond, shutdownActorSystem }
+import akka.actor.ActorSystem
+import akka.actor.Props
+import akka.testkit.TestKit.awaitCond
+import akka.testkit.TestKit.shutdownActorSystem
 import akka.util.unused
 import com.typesafe.config.ConfigFactory
-import org.scalatest.{ Matchers, WordSpec }
-
-import scala.concurrent.duration._
+import org.scalatest.Matchers
+import org.scalatest.WordSpec

 class FailingDowningProvider(@unused system: ActorSystem) extends DowningProvider {
   override val downRemovalMargin: FiniteDuration = 20.seconds

@@ -39,6 +42,10 @@ class DowningProviderSpec extends WordSpec with Matchers {
 loglevel = WARNING
 actor.provider = "cluster"
 remote {
+  artery.canonical {
+    hostname = 127.0.0.1
+    port = 0
+  }
   classic.netty.tcp {
     hostname = "127.0.0.1"
     port = 0

@@ -55,16 +62,6 @@ class DowningProviderSpec extends WordSpec with Matchers {
   shutdownActorSystem(system)
 }

-"use akka.cluster.AutoDowning if 'auto-down-unreachable-after' is configured" in {
-  val system = ActorSystem(
-    "auto-downing",
-    ConfigFactory.parseString("""
-      akka.cluster.auto-down-unreachable-after = 18d
-    """).withFallback(baseConf))
-  Cluster(system).downingProvider shouldBe an[AutoDowning]
-  shutdownActorSystem(system)
-}
-
 "use the specified downing provider" in {
   val system = ActorSystem(
     "auto-downing",
@@ -259,7 +259,8 @@ class JoinConfigCompatCheckerSpec extends AkkaSpec with ClusterTestKit {
 akka.cluster {

 # using explicit downing provider class
-downing-provider-class = "akka.cluster.AutoDowning"
+downing-provider-class = "akka.cluster.testkit.AutoDowning"
+testkit.auto-down-unreachable-after = 0s

 configuration-compatibility-check {
 enforce-on-join = on
@@ -2,17 +2,82 @@
 * Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
 */

-package akka.cluster
-
-import akka.ConfigurationException
-import akka.actor.{ Actor, ActorSystem, Address, Cancellable, Props, Scheduler }
-
-import scala.concurrent.duration.FiniteDuration
-import akka.cluster.ClusterEvent._
-import com.github.ghik.silencer.silent
+package akka.cluster.testkit
+
+import scala.concurrent.duration.Duration
+import scala.concurrent.duration.FiniteDuration
+
+import akka.actor.Actor
+import akka.actor.ActorLogging
+import akka.actor.ActorSystem
+import akka.actor.Address
+import akka.actor.Cancellable
+import akka.actor.Props
+import akka.actor.Scheduler
+import akka.cluster.Cluster
+import akka.cluster.ClusterEvent._
+import akka.cluster.DowningProvider
+import akka.cluster.Member
+import akka.cluster.MembershipState
+import akka.cluster.UniqueAddress
+import akka.util.Helpers.ConfigOps
+import akka.util.Helpers.Requiring
+import akka.util.Helpers.toRootLowerCase
+
+/**
+ * Downing provider used for testing.
+ *
+ * Auto-downing is a naïve approach to remove unreachable nodes from the cluster membership.
+ * In a production environment it will eventually break down the cluster.
+ * When a network partition occurs, both sides of the partition will see the other side as unreachable
+ * and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
+ * (known as *Split Brain*).
+ *
+ * This behavior is not limited to network partitions. It can also occur if a node in the cluster is
+ * overloaded, or experiences a long GC pause.
+ *
+ * When using Cluster Singleton or Cluster Sharding it can break the contract provided by those features.
+ * Both provide a guarantee that an actor will be unique in a cluster.
+ * With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
+ * When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior
+ * in the system.
+ *
+ * This is even more severe when Akka Persistence is used in conjunction with Cluster Sharding.
+ * In this case, the lack of unique actors can cause multiple actors to write to the same journal.
+ * Akka Persistence operates on a single writer principle. Having multiple writers will corrupt
+ * the journal and make it unusable.
+ *
+ * Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
+ * system to form multiple small clusters. These small clusters will be independent from each other. They will be
+ * unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
+ * it will require manual intervention in order to reform the cluster.
+ *
+ * Because of these issues, auto-downing should never be used in a production environment.
+ */
+final class AutoDowning(system: ActorSystem) extends DowningProvider {
+
+  private def clusterSettings = Cluster(system).settings
+
+  private val AutoDownUnreachableAfter: Duration = {
+    val key = "akka.cluster.testkit.auto-down-unreachable-after"
+    // it's not in reference.conf, since only used in tests
+    if (clusterSettings.config.hasPath(key)) {
+      toRootLowerCase(clusterSettings.config.getString(key)) match {
+        case "off" => Duration.Undefined
+        case _     => clusterSettings.config.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
+      }
+    } else
+      Duration.Undefined
+  }
+
+  override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
+
+  override def downingActorProps: Option[Props] =
+    AutoDownUnreachableAfter match {
+      case d: FiniteDuration => Some(AutoDown.props(d))
+      case _                 => None // auto-down-unreachable-after = off
+    }
+}

 /**
  * INTERNAL API
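As the specs earlier in this commit show, the relocated provider is wired in purely through test configuration; a condensed sketch (never for production use):

```
import akka.actor.ActorSystem
import com.typesafe.config.ConfigFactory

// Test-only wiring of akka.cluster.testkit.AutoDowning.
val system = ActorSystem(
  "Test",
  ConfigFactory.parseString("""
    akka.actor.provider = "cluster"
    akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
    akka.cluster.testkit.auto-down-unreachable-after = 1s
    """).withFallback(ConfigFactory.load()))
```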
@@ -25,26 +90,6 @@ private[cluster] object AutoDown {
 final case class UnreachableTimeout(node: UniqueAddress)
 }

-/**
- * Used when no custom provider is configured and 'auto-down-unreachable-after' is enabled.
- */
-final class AutoDowning(system: ActorSystem) extends DowningProvider {
-
-  private def clusterSettings = Cluster(system).settings
-
-  @silent("deprecated")
-  override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
-
-  override def downingActorProps: Option[Props] =
-    clusterSettings.AutoDownUnreachableAfter match {
-      case d: FiniteDuration => Some(AutoDown.props(d))
-      case _ =>
-        // I don't think this can actually happen
-        throw new ConfigurationException(
-          "AutoDowning downing provider selected but 'akka.cluster.auto-down-unreachable-after' not set")
-    }
-}
-
 /**
  * INTERNAL API
  *
@@ -68,9 +113,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)

 // re-subscribe when restart
 override def preStart(): Unit = {
-  log.warning(
-    "Don't use auto-down feature of Akka Cluster in production. " +
-    "See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.")
+  log.debug("Auto-down is enabled in test.")
   cluster.subscribe(self, classOf[ClusterDomainEvent])
   super.preStart()
 }

@@ -81,11 +124,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)

 override def down(node: Address): Unit = {
   require(leader)
-  logInfo(
-    "Leader is auto-downing unreachable node [{}]. " +
-    "Don't use auto-down feature of Akka Cluster in production. " +
-    "See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.",
-    node)
+  logInfo("Leader is auto-downing unreachable node [{}].", node)
   cluster.down(node)
 }
@@ -2,15 +2,18 @@
 * Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
 */

-package akka.cluster
+package akka.cluster.testkit

 import scala.concurrent.duration._

 import akka.actor.ActorRef
 import akka.actor.Address
+import akka.actor.Props
 import akka.actor.Scheduler
 import akka.cluster.ClusterEvent._
+import akka.cluster.Member
 import akka.cluster.MemberStatus._
+import akka.cluster.TestMember
 import akka.remote.RARP
 import akka.testkit.AkkaSpec
 import akka.testkit.TimingTest
@@ -32,7 +32,8 @@ object LotsOfDataBot {
 // Override the configuration of the port
 val config = ConfigFactory
 .parseString("akka.remote.classic.netty.tcp.port=" + port)
-.withFallback(ConfigFactory.load(ConfigFactory.parseString("""
+.withFallback(
+  ConfigFactory.load(ConfigFactory.parseString("""
 passive = off
 max-entries = 100000
 akka.actor.provider = "cluster"

@@ -48,7 +49,8 @@ object LotsOfDataBot {
 "akka://ClusterSystem@127.0.0.1:2551",
 "akka://ClusterSystem@127.0.0.1:2552"]

-auto-down-unreachable-after = 10s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 10s
 }
 """)))
@@ -227,14 +227,6 @@ graceful leaving process of a cluster member.

 See @ref:[removal of Internal Cluster Sharding Data](typed/cluster-sharding.md#removal-of-internal-cluster-sharding-data) in the documentation of the new APIs.

-## Configuration
-
-`ClusterShardingSettings` is a parameter to the `start` method of
-the `ClusterSharding` extension, i.e. each each entity type can be configured with different settings
-if needed.
-
-See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.
-
 ## Inspecting cluster sharding state

 Two requests to inspect the cluster state are available:

@@ -256,20 +248,13 @@ directly sending messages to the individual entities.

 ## Lease

-A @ref[lease](coordination.md) can be used as an additional safety measure to ensure a shard
-does not run on two nodes.
+A lease can be used as an additional safety measure to ensure a shard does not run on two nodes.
+See @ref:[Lease](typed/cluster-sharding.md#lease) in the documentation of the new APIs.

-Reasons for how this can happen:
+## Configuration

-* Network partitions without an appropriate downing provider
-* Mistakes in the deployment process leading to two separate Akka Clusters
-* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
+`ClusterShardingSettings` is a parameter to the `start` method of
+the `ClusterSharding` extension, i.e. each entity type can be configured with different settings
+if needed.

-A lease can be a final backup that means that each shard won't create child entity actors unless it has the lease.
-
-To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
-of the lease to use. Each shard will try and acquire a lease with with the name `<actor system name>-shard-<type name>-<shard id>` and
-the owner is set to the `Cluster(system).selfAddress.hostPort`.
-
-If a shard can't acquire a lease it will remain uninitialized so messages for entities it owns will
-be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
+See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.
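The moved Configuration section describes per-entity-type settings; a hedged sketch of the classic `start` call it refers to (the type name, entity `Props`, and extractors are placeholders):

```
import akka.actor.{ ActorSystem, Props }
import akka.cluster.sharding.{ ClusterSharding, ClusterShardingSettings, ShardRegion }

val system = ActorSystem("ClusterSystem")

// Placeholder extractors; a real application derives these from its messages.
val extractEntityId: ShardRegion.ExtractEntityId = {
  case msg @ (id: String, _) => (id, msg)
}
val extractShardId: ShardRegion.ExtractShardId = {
  case (id: String, _) => (math.abs(id.hashCode) % 10).toString
  case _               => "0"
}

// Each entity type can be started with its own ClusterShardingSettings.
val region = ClusterSharding(system).start(
  typeName = "Counter",      // placeholder
  entityProps = Props.empty, // placeholder for the entity actor
  settings = ClusterShardingSettings(system),
  extractEntityId = extractEntityId,
  extractShardId = extractShardId)
```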
@@ -104,6 +104,14 @@ Scala
 Java
 : @@snip [SimpleClusterListener2.java](/akka-docs/src/test/java/jdocs/cluster/SimpleClusterListener2.java) { #join }

+## Leaving
+
+See @ref:[Leaving](typed/cluster.md#leaving) in the documentation of the new APIs.
+
+## Downing
+
+See @ref:[Downing](typed/cluster.md#downing) in the documentation of the new APIs.
+
 <a id="cluster-subscriber"></a>
 ## Subscribe to Cluster Events
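For reference, the graceful-leaving call the linked section describes, in minimal form:

```
import akka.actor.ActorSystem
import akka.cluster.Cluster

val system = ActorSystem("ClusterSystem")

// Graceful leaving: hand-over (e.g. of singletons and shards) is initiated
// before the member is removed from the cluster.
Cluster(system).leave(Cluster(system).selfAddress)
```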
@@ -3,7 +3,7 @@
 ## Commercial Support

 Commercial support is provided by [Lightbend](http://www.lightbend.com).
-Akka is part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
+Akka is part of the [Lightbend Platform](http://www.lightbend.com/platform).

 ## Sponsors
@@ -11,6 +11,40 @@ is [no longer available as a static method](https://github.com/scala/bug/issues/

 If you are still using Scala 2.11 then you must upgrade to 2.12 or 2.13

+## Auto-downing removed
+
+Auto-downing of unreachable Cluster members has been removed after warnings and recommendations against using it
+for many years. It was disabled by default, but could be enabled with the configuration
+`akka.cluster.auto-down-unreachable-after`.
+
+For alternatives see the @ref:[documentation about Downing](../typed/cluster.md#downing).
+
+Auto-downing was a naïve approach to remove unreachable nodes from the cluster membership.
+In a production environment it will eventually break down the cluster.
+When a network partition occurs, both sides of the partition will see the other side as unreachable
+and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
+(known as *Split Brain*).
+
+This behavior is not limited to network partitions. It can also occur if a node in the cluster is
+overloaded, or experiences a long GC pause.
+
+When using @ref:[Cluster Singleton](../typed/cluster-singleton.md) or @ref:[Cluster Sharding](../typed/cluster-sharding.md)
+it can break the contract provided by those features. Both provide a guarantee that an actor will be unique in a cluster.
+With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
+When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior in the system.
+
+This is even more severe when @ref:[Akka Persistence](../typed/persistence.md) is used in conjunction with
+Cluster Sharding. In this case, the lack of unique actors can cause multiple actors to write to the same journal.
+Akka Persistence operates on a single writer principle. Having multiple writers will corrupt the journal
+and make it unusable.
+
+Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
+system to form multiple small clusters. These small clusters will be independent from each other. They will be
+unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
+it will require manual intervention in order to reform the cluster.
+
+Because of these issues, auto-downing should **never** be used in a production environment.
+
 ## Removed features that were deprecated

 After being deprecated since 2.5.0, the following have been removed in Akka 2.6.
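A before/after sketch of the removed setting, assuming the simplest replacement (no automatic downing at all; production systems should instead pick a real downing strategy such as a split brain resolver):

```
import com.typesafe.config.ConfigFactory

// Before (Akka 2.5, removed in 2.6):
//   akka.cluster.auto-down-unreachable-after = 10s
// After: leave downing-provider-class empty (NoDowning) and down members
// manually, or configure a proper DowningProvider implementation.
val config = ConfigFactory.parseString("""
  akka.cluster.downing-provider-class = ""
  """)
```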
@@ -94,13 +128,25 @@ to make remote interactions look like local method calls.
 Warnings about `TypedActor` have been [mentioned in documentation](https://doc.akka.io/docs/akka/2.5/typed-actors.html#when-to-use-typed-actors)
 for many years.

+### akka-protobuf
+
+`akka-protobuf` was never intended to be used by end users but perhaps this was not well-documented.
+Applications should use standard Protobuf dependency instead of `akka-protobuf`. The artifact is still
+published, but the transitive dependency to `akka-protobuf` has been removed.
+
+Akka is now using Protobuf version 3.9.0 for serialization of messages defined by Akka.
+
+### Cluster Client
+
+Cluster client has been deprecated as of 2.6 in favor of [Akka gRPC](https://doc.akka.io/docs/akka-grpc/current/index.html).
+It is not advised to build new applications with Cluster client, and existing users @ref[should migrate to Akka gRPC](../cluster-client.md#migration-to-akka-grpc).
+
 ### akka.Main

 `akka.Main` is deprecated in favour of starting the `ActorSystem` from a custom main class instead. `akka.Main` was not
 adding much value and typically a custom main class is needed anyway.

 ## Remoting

 ### Default remoting is now Artery TCP
@@ -184,20 +230,7 @@ For TCP:

 Classic remoting is deprecated but can be used in `2.6.` Explicitly disable Artery by setting property `akka.remote.artery.enabled` to `false`. Further, any configuration under `akka.remote` that is
 specific to classic remoting needs to be moved to `akka.remote.classic`. To see which configuration options
-are specific to classic search for them in: [`akka-remote/reference.conf`](/akka-remote/src/main/resources/reference.conf)
-
-### akka-protobuf
-
-`akka-protobuf` was never intended to be used by end users but perhaps this was not well-documented.
-Applications should use standard Protobuf dependency instead of `akka-protobuf`. The artifact is still
-published, but the transitive dependency to `akka-protobuf` has been removed.
-
-Akka is now using Protobuf version 3.9.0 for serialization of messages defined by Akka.
-
-### Cluster Client
-
-Cluster client has been deprecated as of 2.6 in favor of [Akka gRPC](https://doc.akka.io/docs/akka-grpc/current/index.html).
-It is not advised to build new applications with Cluster client, and existing users @ref[should migrate to Akka gRPC](../cluster-client.md#migration-to-akka-grpc).
+are specific to classic search for them in: @ref:[`akka-remote/reference.conf`](../general/configuration.md#config-akka-remote).

 ## Java Serialization
@@ -235,14 +268,12 @@ handling that type and it was previously "accidentally" serialized with Java ser
 The following documents configuration changes and behavior changes where no action is required. In some cases the old
 behavior can be restored via configuration.

-### Remoting
-
-#### Remoting dependencies have been made optional
+### Remoting dependencies have been made optional

 Classic remoting depends on Netty and Artery UDP depends on Aeron. These are now both optional dependencies that need
 to be explicitly added. See @ref[classic remoting](../remoting.md) or @ref[artery remoting](../remoting-artery.md) for instructions.

-#### Remote watch and deployment have been disabled without Cluster use
+### Remote watch and deployment have been disabled without Cluster use

 By default, these remoting features are disabled when not using Akka Cluster:
@@ -43,10 +43,10 @@ if that feature is enabled.

 @@@ warning

-**Don't use Cluster Sharding together with Automatic Downing**,
-since it allows the cluster to split up into two separate clusters, which in turn will result
-in *multiple shards and entities* being started, one in each separate cluster!
-See @ref:[Downing](cluster.md#automatic-vs-manual-downing).
+Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
+case of network problems or system overload (long GC pauses), since that will result in *multiple shards and entities*
+being started, one in each separate cluster!
+See @ref:[Downing](cluster.md#downing).

 @@@
@@ -304,6 +304,26 @@ rebalanced to other nodes.
 See @ref:[How To Startup when Cluster Size Reached](cluster.md#how-to-startup-when-a-cluster-size-is-reached)
 for more information about `min-nr-of-members`.

+## Lease
+
+A @ref[lease](../coordination.md) can be used as an additional safety measure to ensure a shard
+does not run on two nodes.
+
+Reasons for how this can happen:
+
+* Network partitions without an appropriate downing provider
+* Mistakes in the deployment process leading to two separate Akka Clusters
+* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
+
+A lease can be a final backup that means that each shard won't create child entity actors unless it has the lease.
+
+To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
+of the lease to use. Each shard will try and acquire a lease with the name `<actor system name>-shard-<type name>-<shard id>` and
+the owner is set to the `Cluster(system).selfAddress.hostPort`.
+
+If a shard can't acquire a lease it will remain uninitialized so messages for entities it owns will
+be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
+
 ## Removal of internal Cluster Sharding data

 Removal of internal Cluster Sharding data is only relevant for "Persistent Mode".
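A sketch of the wiring this section describes; the lease provider path below is purely illustrative, not something this page prescribes:

```
import com.typesafe.config.ConfigFactory

// use-lease points at the config location of a lease implementation
// (illustrative path). Lease name per shard:
//   <actor system name>-shard-<type name>-<shard id>
// Owner: Cluster(system).selfAddress.hostPort
val leaseConfig = ConfigFactory.parseString("""
  akka.cluster.sharding.use-lease = "akka.coordination.lease.kubernetes"
  """)
```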
@@ -326,15 +346,6 @@ cannot startup because of corrupt data, which may happen if accidentally
 two clusters were running at the same time, e.g. caused by using auto-down
 and there was a network partition.

-@@@ warning
-
-**Don't use Cluster Sharding together with Automatic Downing**,
-since it allows the cluster to split up into two separate clusters, which in turn will result
-in *multiple shards and entities* being started, one in each separate cluster!
-See @ref:[Downing](cluster.md#automatic-vs-manual-downing).
-
-@@@
-
 Use this program as a standalone Java main program:

 ```

@@ -347,7 +358,7 @@ The program is included in the `akka-cluster-sharding` jar file. It
 is easiest to run it with same classpath and configuration as your ordinary
 application. It can be run from sbt or Maven in similar way.

-Specify the entity type names (same as you use in the `start` method
+Specify the entity type names (same as you use in the `init` method
 of `ClusterSharding`) as program arguments.

 If you specify `-2.3` as the first program argument it will also try
@@ -32,6 +32,15 @@ such as single-point of bottleneck. Single-point of failure is also a relevant c
but for some cases this feature takes care of that by making sure that another singleton
instance will eventually be started.

@@@ warning

Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).

@@@

### Singleton manager

The cluster singleton pattern manages one singleton actor instance among all cluster nodes or a group of nodes tagged with
@@ -80,22 +89,19 @@ The singleton instance will not run on members with status @ref:[WeaklyUp](clust

This pattern may seem to be very tempting to use at first, but it has several drawbacks; some of them are listed below:

* the cluster singleton may quickly become a *performance bottleneck*,
* you can not rely on the cluster singleton to be *non-stop* available — e.g. when the node on which the singleton has
been running dies, it will take a few seconds for this to be noticed and the singleton be migrated to another node,
* in the case of a *network partition* appearing in a Cluster that is using Automatic Downing (see docs for
@ref:[Auto Downing](cluster.md#auto-downing-do-not-use)),
it may happen that the isolated clusters each decide to spin up their own singleton, meaning that there might be multiple
singletons running in the system, yet the Clusters have no way of finding out about them (because of the partition).

Especially the last point is something you should be aware of — in general when using the Cluster Singleton pattern
you should take care of downing nodes yourself and not rely on the timing based auto-down feature.
* The cluster singleton may quickly become a *performance bottleneck*.
* You cannot rely on the cluster singleton to be *non-stop* available — e.g. when the node on which the singleton
has been running dies, it will take a few seconds for this to be noticed and for the singleton to be migrated to another node.
* If many singletons are used, be aware that they will all run on the oldest node (or the oldest node with the configured role).
@ref:[Cluster Sharding](cluster-sharding.md) combined with keeping the "singleton" entities alive can be a better
alternative.

@@@ warning

**Don't use Cluster Singleton together with Automatic Downing**,
since it allows the cluster to split up into two separate clusters, which in turn will result
in *multiple Singletons* being started, one in each separate cluster!
Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).

@@@
@@ -255,95 +255,69 @@ after the restart, when it comes up as a new incarnation of an existing member in the
trying to join in, then the existing one will be removed from the cluster and then it will
be allowed to join.

<a id="automatic-vs-manual-downing"></a>
### Downing

When a member is considered by the failure detector to be `unreachable` the
leader is not allowed to perform its duties, such as changing status of
new joining members to 'Up'. The node must first become `reachable` again, or the
status of the unreachable member must be changed to 'Down'. Changing status to 'Down'
can be performed automatically or manually. By default it must be done manually, using
@ref:[JMX](../additional/operations.md#jmx) or @ref:[HTTP](../additional/operations.md#http).

It can also be performed programmatically with @scala[`Cluster(system).down(address)`]@java[`Cluster.get(system).down(address)`].

If a node is still running and sees itself as Down it will shut down. @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically
run if `run-coordinated-shutdown-when-down` is set to `on` (the default), however the node will not try
to leave the cluster gracefully so sharding and singleton migration will not occur.

A production solution for the downing problem is provided by
[Split Brain Resolver](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html),
which is part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
If you don’t use RP, you should anyway carefully read the [documentation](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html)
of the Split Brain Resolver and make sure that the solution you are using handles the concerns
described there.
### Auto-downing - DO NOT USE

There is an automatic downing feature that you should not use in production. For testing you can enable it with configuration:

```
akka.cluster.auto-down-unreachable-after = 120s
```

This means that the cluster leader member will change the `unreachable` node
status to `down` automatically after the configured time of unreachability.

This is a naïve approach to remove unreachable nodes from the cluster membership.
It can be useful during development but in a production environment it will eventually break down the cluster.
When a network partition occurs, both sides of the partition will see the other side as unreachable and remove it from the cluster.
This results in the formation of two separate, disconnected, clusters (known as *Split Brain*).

This behaviour is not limited to network partitions. It can also occur if a node
in the cluster is overloaded, or experiences a long GC pause.
@@@ warning

We recommend against using the auto-down feature of Akka Cluster in production. It
has multiple undesirable consequences for production systems.

If you are using @ref:[Cluster Singleton](cluster-singleton.md) or @ref:[Cluster Sharding](cluster-sharding.md) it can break the contract provided by
those features. Both provide a guarantee that an actor will be unique in a cluster.
With the auto-down feature enabled, it is possible for multiple independent clusters
to form (*Split Brain*). When this happens the guaranteed uniqueness will no
longer be true resulting in undesirable behaviour in the system.

This is even more severe when @ref:[Akka Persistence](persistence.md) is used in
conjunction with Cluster Sharding. In this case, the lack of unique actors can
cause multiple actors to write to the same journal. Akka Persistence operates on a
single writer principle. Having multiple writers will corrupt the journal
and make it unusable.

Finally, even if you don't use features such as Persistence, Sharding, or Singletons,
auto-downing can lead the system to form multiple small clusters. These small
clusters will be independent from each other. They will be unable to communicate
and as a result you may experience performance degradation. Once this condition
occurs, it will require manual intervention in order to reform the cluster.

Because of these issues, auto-downing should **never** be used in a production environment.

@@@
### Leaving

There are two ways to remove a member from the cluster.
There are a few ways to remove a member from the cluster.

1. The recommended way to leave a cluster is a graceful exit, informing the cluster that a node shall leave.
This can be performed using @ref:[JMX](../additional/operations.md#jmx) or @ref:[HTTP](../additional/operations.md#http).
This method will offer faster hand off to peer nodes during node shutdown.
1. When a graceful exit is not possible, you can stop the actor system (or the JVM process, for example a SIGTERM sent from the environment). It will be detected
as unreachable and removed after the automatic or manual downing.
This is performed by @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) when the `ActorSystem`
is terminated and also when a SIGTERM is sent from the environment to stop the JVM process.
1. Graceful exit can also be performed using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).
1. When a graceful exit is not possible, for example in case of abrupt termination of the JVM process, the node
will be detected as unreachable by other nodes and removed after @ref:[Downing](#downing).

The @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically run when the cluster node sees itself as
Graceful leaving will offer faster hand off to peer nodes during node shutdown than abrupt termination and downing.

The @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will also run when the cluster node sees itself as
`Exiting`, i.e. leaving from another node will trigger the shutdown process on the leaving node.
Tasks for graceful leaving of the cluster, including graceful shutdown of Cluster Singletons and
Cluster Sharding, are added automatically when Akka Cluster is used, i.e. running the shutdown
process will also trigger the graceful leaving if it's not already in progress.

Normally this is handled automatically, but in case of network failures during this process it might still
be necessary to set the node’s status to `Down` in order to complete the removal. For handling network failures
see [Split Brain Resolver](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html),
part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
be necessary to set the node’s status to `Down` in order to complete the removal, see @ref:[Downing](#downing).
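Graceful leaving can also be initiated programmatically. As a sketch only (assuming the typed Cluster API, with names illustrative):

```
import akka.actor.typed.ActorSystem
import akka.cluster.typed.{ Cluster, Leave }

// Sketch: ask the cluster to let this node leave gracefully; Coordinated
// Shutdown then runs on this node once it sees itself as Exiting.
def leaveSelf(system: ActorSystem[_]): Unit = {
  val cluster = Cluster(system)
  cluster.manager ! Leave(cluster.selfMember.address)
}
```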
### Downing

In many cases a member can gracefully exit from the cluster as described in @ref:[Leaving](#leaving), but
there are scenarios when an explicit downing decision is needed before it can be removed. For example in case
of abrupt termination of the JVM process, system overload that doesn't recover, or network partitions
that don't heal. In such cases the node(s) will be detected as unreachable by other nodes, but they must also
be marked as `Down` before they are removed.

When a member is considered by the failure detector to be `unreachable` the
leader is not allowed to perform its duties, such as changing status of
new joining members to `Up`. The node must first become `reachable` again, or the
status of the unreachable member must be changed to `Down`. Changing status to `Down`
can be performed automatically or manually.

By default, downing must be performed manually using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).

Note that @ref:[Cluster Singleton](cluster-singleton.md) or @ref:[Cluster Sharding entities](cluster-sharding.md) that
are running on a crashed (unreachable) node will not be started on another node until the previous node has
been removed from the Cluster. Removal of crashed (unreachable) nodes is performed after a downing decision.

A production solution for downing is provided by
[Split Brain Resolver](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html),
which is part of the [Lightbend Platform](http://www.lightbend.com/platform).
If you don’t have a Lightbend Platform Subscription, you should still carefully read the
[documentation](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html)
of the Split Brain Resolver and make sure that the solution you are using handles the concerns and scenarios
described there.

A custom downing strategy can be implemented with a @apidoc[akka.cluster.DowningProvider] and enabled with
configuration `akka.cluster.downing-provider-class`.
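As an illustration only (the class and package names are hypothetical), a minimal `DowningProvider` sketch could look like this:

```
import akka.actor.{ ActorSystem, Props }
import akka.cluster.DowningProvider
import scala.concurrent.duration.{ Duration, FiniteDuration }

// Hypothetical example, not part of Akka: a provider that starts no downing
// actor, so downing decisions remain manual. The constructor must accept the
// ActorSystem, since the class is instantiated reflectively from config.
final class ManualDowningProvider(system: ActorSystem) extends DowningProvider {
  // Margin the leader waits after a downing decision before removing the member.
  override def downRemovalMargin: FiniteDuration = Duration.Zero
  // No actor to start: no automatic downing decisions are taken.
  override def downingActorProps: Option[Props] = None
}
```

It would then be enabled with (the fully qualified class name is hypothetical):

```
akka.cluster.downing-provider-class = "com.example.ManualDowningProvider"
```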
Downing can also be performed programmatically with @scala[`Cluster(system).manager ! Down(address)`]@java[`Cluster.get(system).manager().tell(Down(address))`],
but that is mostly useful from tests and when implementing a `DowningProvider`.
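A minimal sketch of the Scala variant (assuming a typed `ActorSystem` and the target `Address` are in scope):

```
import akka.actor.Address
import akka.actor.typed.ActorSystem
import akka.cluster.typed.{ Cluster, Down }

// Sketch for use in tests: mark the member at `address` as Down.
def downMember(system: ActorSystem[_], address: Address): Unit =
  Cluster(system).manager ! Down(address)
```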
If a crashed node is restarted with the same hostname and port and joins the cluster again, the previous incarnation
of that member will be downed and removed. The new join attempt with the same hostname and port is used as evidence
that the previous incarnation is not alive any more.

If a node is still running and sees itself as `Down` it will shut down. @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically
run if `run-coordinated-shutdown-when-down` is set to `on` (the default), however the node will not try
to leave the cluster gracefully.
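This behavior is controlled by the following setting, shown here with its default value:

```
# Default: run Coordinated Shutdown when this node sees itself as Down.
# Setting this to `off` means a downed node keeps running, which is not recommended.
akka.cluster.run-coordinated-shutdown-when-down = on
```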
## Node Roles