Remove auto-downing, #27788 (#27855)

* moved to cluster tests, in new package akka.cluster.testkit
* changed config in tests
* migration guide
* documentation clarifications for Downing and Leaving
* update warnings in Singleton and Sharding
Patrik Nordwall 2019-10-03 14:08:43 +02:00 committed by GitHub
parent 064f06f5a6
commit a217d5566e
61 changed files with 414 additions and 309 deletions

View file

@@ -63,7 +63,8 @@ abstract class ClusterShardingFailureSpecConfig(val mode: String) extends MultiN
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.roles = ["backend"]
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {

View file

@@ -50,7 +50,8 @@ object ClusterShardingGetStateSpecConfig extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
  coordinator-failure-backoff = 3s
  shard-failure-backoff = 3s

View file

@@ -56,7 +56,8 @@ object ClusterShardingGetStatsSpecConfig extends MultiNodeConfig {
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.log-dead-letters-during-shutdown = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
  state-store-mode = "ddata"
  updating-state-timeout = 2s

View file

@@ -67,7 +67,8 @@ abstract class ClusterShardingLeavingSpecConfig(val mode: String) extends MultiN
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {
  timeout = 5s

View file

@@ -55,7 +55,8 @@ object ClusterShardingQueriesSpecConfig extends MultiNodeConfig {
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.log-dead-letters-during-shutdown = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
  state-store-mode = "ddata"
  shard-region-query-timeout = 0ms

View file

@@ -61,7 +61,8 @@ abstract class ClusterShardingRememberEntitiesNewExtractorSpecConfig(val mode: S
ConfigFactory
.parseString(s"""
akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.classic.log-remote-lifecycle-events = off
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {

View file

@@ -56,7 +56,8 @@ object ClusterShardingRememberEntitiesPerfSpecConfig extends MultiNodeConfig {
commonConfig(ConfigFactory.parseString(s"""
akka.loglevel = INFO
akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.testconductor.barrier-timeout = 3 minutes
akka.remote.artery.advanced.outbound-message-queue-size = 10000

View file

@@ -69,7 +69,8 @@ abstract class ClusterShardingRememberEntitiesSpecConfig(val mode: String, val r
modeConfig
.withFallback(ConfigFactory.parseString(s"""
akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.cluster.sharding.state-store-mode = "$mode"
akka.cluster.sharding.distributed-data.durable.lmdb {

View file

@@ -135,7 +135,8 @@ abstract class ClusterShardingSpecConfig(val mode: String, val entityRecoveryStr
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.roles = ["backend"]
akka.cluster.distributed-data.gossip-interval = 1s
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"

View file

@@ -61,7 +61,8 @@ object MultiDcClusterShardingSpecConfig extends MultiNodeConfig {
akka.cluster {
  debug.verbose-heartbeat-logging = on
  debug.verbose-gossip-logging = on
-  auto-down-unreachable-after = 0s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 0s
  sharding {
    retry-interval = 200ms
  }

View file

@@ -44,7 +44,8 @@ abstract class MultiNodeClusterShardingConfig(
.withFallback(ConfigFactory.parseString(s"""
akka.loglevel = $loglevel
akka.actor.provider = "cluster"
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.cluster.sharding.state-store-mode = "$mode"
akka.cluster.sharding.distributed-data.durable.lmdb {

View file

@@ -36,7 +36,6 @@ import akka.pattern.ask
import akka.pattern.pipe
import akka.util.JavaDurationConverters._
import akka.util.Timeout
-import com.github.ghik.silencer.silent
import com.typesafe.config.Config

object ClusterSingletonManagerSettings {

@@ -45,7 +44,6 @@ object ClusterSingletonManagerSettings {
 * Create settings from the default configuration
 * `akka.cluster.singleton`.
 */
-@silent("deprecated") // DownRemovalMargin
def apply(system: ActorSystem): ClusterSingletonManagerSettings =
  apply(system.settings.config.getConfig("akka.cluster.singleton"))

// note that this setting has some additional logic inside the ClusterSingletonManager

View file

@@ -45,7 +45,8 @@ object ClusterClientSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.client.heartbeat-interval = 1s
akka.cluster.client.acceptable-heartbeat-pause = 3s
akka.cluster.client.refresh-contacts-interval = 1s

View file

@@ -30,7 +30,8 @@ object DistributedPubSubMediatorSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.pub-sub.max-delta-elements = 500
"""))

View file

@@ -33,7 +33,8 @@ object DistributedPubSubRestartSpec extends MultiNodeConfig {
akka.cluster.pub-sub.gossip-interval = 500ms
akka.actor.provider = cluster
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
"""))

testTransport(on = true)

View file

@@ -35,7 +35,8 @@ object ClusterSingletonManagerChaosSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))

case object EchoStarted

View file

@@ -28,7 +28,8 @@ object ClusterSingletonManagerLeaseSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
test-lease {
  lease-class = akka.cluster.TestLeaseActorClient
  heartbeat-interval = 1s

View file

@@ -33,7 +33,8 @@ object ClusterSingletonManagerLeave2Spec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
"""))

case object EchoStarted

View file

@@ -26,7 +26,8 @@ object ClusterSingletonManagerLeaveSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = off
"""))

case object EchoStarted

View file

@@ -40,7 +40,8 @@ object ClusterSingletonManagerSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))

nodeConfig(first, second, third, fourth, fifth, sixth)(ConfigFactory.parseString("akka.cluster.roles =[worker]"))

View file

@@ -27,7 +27,8 @@ object ClusterSingletonManagerStartupSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
-akka.cluster.auto-down-unreachable-after = 0s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))

case object EchoStarted

View file

@@ -44,7 +44,8 @@ class ClusterSingletonLeavingSpeedSpec
"""
akka.loglevel = DEBUG
akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
# With 10 systems and setting min-number-of-hand-over-retries to 5 and gossip-interval to 2s it's possible to
# reproduce the ClusterSingletonManagerIsStuck and slow hand over in issue #25639

View file

@@ -31,7 +31,8 @@ class ClusterSingletonRestart2Spec
akka.loglevel = INFO
akka.cluster.roles = [singleton]
akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.cluster.singleton.min-number-of-hand-over-retries = 5
akka.remote {
  classic.netty.tcp {

View file

@@ -14,10 +14,12 @@ import akka.testkit.TestActors
import akka.testkit.TestProbe
import com.typesafe.config.ConfigFactory

-class ClusterSingletonRestartSpec extends AkkaSpec("""
+class ClusterSingletonRestartSpec
+    extends AkkaSpec("""
akka.loglevel = INFO
akka.actor.provider = akka.cluster.ClusterActorRefProvider
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.remote {
  classic.netty.tcp {
    hostname = "127.0.0.1"

View file

@@ -52,7 +52,6 @@ object ClusterReceptionistSpec {
}
akka.cluster {
-  #auto-down-unreachable-after = 0s
  jmx.multi-mbeans-in-same-jvm = on
  failure-detector.acceptable-heartbeat-pause = 3s
}

View file

@@ -0,0 +1,8 @@
+# #27788 Remove AutoDowning
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout")
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout$")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.ClusterSettings.AutoDownUnreachableAfter")
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDownBase")
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$")
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown")
+ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDowning")

View file

@@ -35,33 +35,17 @@ akka {
  # attempts.
  shutdown-after-unsuccessful-join-seed-nodes = off

-  # Should the 'leader' in the cluster be allowed to automatically mark
-  # unreachable nodes as DOWN after a configured time of unreachability?
-  # Using auto-down implies that two separate clusters will automatically be
-  # formed in case of network partition.
-  #
-  # Don't enable this in production, see 'Auto-downing (DO NOT USE)' section
-  # of Akka Cluster documentation.
-  #
-  # Disable with "off" or specify a duration to enable auto-down.
-  # If a downing-provider-class is configured this setting is ignored.
-  auto-down-unreachable-after = off

  # Time margin after which shards or singletons that belonged to a downed/removed
  # partition are created in surviving partition. The purpose of this margin is that
  # in case of a network partition the persistent actors in the non-surviving partitions
  # must be stopped before corresponding persistent actors are started somewhere else.
  # This is useful if you implement downing strategies that handle network partitions,
  # e.g. by keeping the larger side of the partition and shutting down the smaller side.
-  # It will not add any extra safety for auto-down-unreachable-after, since that is not
-  # handling network partitions.
  # Disable with "off" or specify a duration to enable.
  down-removal-margin = off

  # Pluggable support for downing of nodes in the cluster.
-  # If this setting is left empty behavior will depend on 'auto-down-unreachable' in the following ways:
-  # * if it is 'off' the `NoDowning` provider is used and no automatic downing will be performed
-  # * if it is set to a duration the `AutoDowning` provider is with the configured downing duration
+  # If this setting is left empty the `NoDowning` provider is used and no automatic downing will be performed.
  #
  # If specified the value must be the fully qualified class name of a subclass of
  # `akka.cluster.DowningProvider` having a public one argument constructor accepting an `ActorSystem`

View file

@@ -125,8 +125,19 @@ class Cluster(val system: ExtendedActorSystem) extends Extension {
  }

  // needs to be lazy to allow downing provider impls to access Cluster (if not we get deadlock)
-  lazy val downingProvider: DowningProvider =
+  lazy val downingProvider: DowningProvider = {
+    checkAutoDownUsage()
    DowningProvider.load(settings.DowningProviderClassName, system)
+  }
+
+  private def checkAutoDownUsage(): Unit = {
+    if (settings.DowningProviderClassName == "akka.cluster.AutoDowning" ||
+        (settings.config.hasPath("auto-down-unreachable-after") && settings.config.getString(
+          "auto-down-unreachable-after") != "off"))
+      logWarning(
+        "auto-down has been removed in Akka 2.6.0. See " +
+        "https://doc.akka.io/docs/akka/2.6/typed/cluster.html#downing for alternatives.")
+  }

  // ========================================================
  // ===================== WORK DAEMONS =====================

View file

@@ -406,12 +406,17 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
  override def preStart(): Unit = {
    subscribeQuarantinedEvent()

-    cluster.downingProvider.downingActorProps.foreach { props =>
-      val propsWithDispatcher =
-        if (props.dispatcher == Deploy.NoDispatcherGiven) props.withDispatcher(context.props.dispatcher)
-        else props
-      context.actorOf(propsWithDispatcher, name = "downingProvider")
+    cluster.downingProvider.downingActorProps match {
+      case Some(props) =>
+        val propsWithDispatcher =
+          if (props.dispatcher == Deploy.NoDispatcherGiven) props.withDispatcher(context.props.dispatcher)
+          else props
+        context.actorOf(propsWithDispatcher, name = "downingProvider")
+      case None =>
+        logInfo(
+          "No downing-provider-class configured, manual cluster downing required, see " +
+          "https://doc.akka.io/docs/akka/current/typed/cluster.html#downing")
    }

    if (seedNodes.isEmpty) {

@@ -420,7 +425,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
      else
        logInfo(
          "No seed-nodes configured, manual cluster join required, see " +
-          "https://doc.akka.io/docs/akka/current/cluster-usage.html#joining-to-seed-nodes")
+          "https://doc.akka.io/docs/akka/current/typed/cluster.html#joining")
    } else {
      self ! JoinSeedNodes(seedNodes)
    }

View file

@@ -116,21 +116,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
    cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s")
  }

-  // specific to the [[akka.cluster.DefaultDowningProvider]]
-  val AutoDownUnreachableAfter: Duration = {
-    val key = "auto-down-unreachable-after"
-    toRootLowerCase(cc.getString(key)) match {
-      case "off" => Duration.Undefined
-      case _     => cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
-    }
-  }
-
-  /**
-   * @deprecated Specific to [[akka.cluster.AutoDown]] should not be used anywhere else, instead
-   * ``Cluster.downingProvider.downRemovalMargin`` should be used as it allows the downing provider to decide removal
-   * margins
-   */
-  @deprecated("Use Cluster.downingProvider.downRemovalMargin", since = "2.4.5")
  val DownRemovalMargin: FiniteDuration = {
    val key = "down-removal-margin"
    toRootLowerCase(cc.getString(key)) match {

@@ -142,7 +127,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
  val DowningProviderClassName: String = {
    val name = cc.getString("downing-provider-class")
    if (name.nonEmpty) name
-    else if (AutoDownUnreachableAfter.isFinite) classOf[AutoDowning].getName
    else classOf[NoDowning].getName
  }

View file

@@ -6,7 +6,6 @@ package akka.cluster

import akka.ConfigurationException
import akka.actor.{ ActorSystem, ExtendedActorSystem, Props }
-import com.github.ghik.silencer.silent

import scala.concurrent.duration.FiniteDuration

@@ -35,6 +34,15 @@ private[cluster] object DowningProvider {
/**
 * API for plugins that will handle downing of cluster nodes. Concrete plugins must subclass and
 * have a public one argument constructor accepting an [[akka.actor.ActorSystem]].
+ *
+ * A custom `DowningProvider` can be configured with `akka.cluster.downing-provider-class`
+ *
+ * When implementing a downing provider you should make sure that it will not split the cluster into
+ * several separate clusters in case of network problems or system overload (long GC pauses). This
+ * is much more difficult than it might be perceived at first, so carefully read the concerns and scenarios
+ * described in
+ * https://doc.akka.io/docs/akka/current/typed/cluster.html#downing and
+ * https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html
 */
abstract class DowningProvider {

@@ -61,11 +69,9 @@ abstract class DowningProvider {
}

/**
- * Default downing provider used when no provider is configured and 'auto-down-unreachable-after'
- * is not enabled.
+ * Default downing provider used when no provider is configured.
 */
final class NoDowning(system: ActorSystem) extends DowningProvider {
-  @silent("deprecated")
  override def downRemovalMargin: FiniteDuration = Cluster(system).settings.DownRemovalMargin
  override val downingActorProps: Option[Props] = None
}
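To make the contract above concrete, here is a minimal sketch of a custom provider. The class names and the majority heuristic are hypothetical illustrations, not part of this commit; a production strategy must handle the partition scenarios linked in the scaladoc:

```scala
import akka.actor.{ Actor, ActorSystem, Props }
import akka.cluster.{ Cluster, DowningProvider }
import akka.cluster.ClusterEvent.{ ClusterDomainEvent, UnreachableMember }
import scala.concurrent.duration._

// Hypothetical provider: downs unreachable members only while this side
// still sees a majority of the known membership.
final class NaiveMajorityDowningProvider(system: ActorSystem) extends DowningProvider {
  override def downRemovalMargin: FiniteDuration = 20.seconds
  override def downingActorProps: Option[Props] = Some(Props(new NaiveMajorityDowning))
}

class NaiveMajorityDowning extends Actor {
  private val cluster = Cluster(context.system)
  override def preStart(): Unit = cluster.subscribe(self, classOf[ClusterDomainEvent])
  override def postStop(): Unit = cluster.unsubscribe(self)
  def receive = {
    case UnreachableMember(m) =>
      val members = cluster.state.members.size
      val unreachable = cluster.state.unreachable.size
      if (members - unreachable > members / 2)
        cluster.down(m.address) // only the majority side acts
    case _ => // ignore other cluster events
  }
}
```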

View file

@@ -22,7 +22,8 @@ object LeaderDowningAllOtherNodesMultiJvmSpec extends MultiNodeConfig {
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.failure-detector.monitored-by-nr-of-members = 2
-akka.cluster.auto-down-unreachable-after = 1s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 1s
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
}

View file

@@ -21,7 +21,9 @@ final case class LeaderDowningNodeThatIsUnreachableMultiNodeConfig(failureDetect
commonConfig(
debugConfig(on = false)
-.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 2s"))
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s"""))
.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
}

View file

@@ -21,7 +21,9 @@ object LeaderLeavingMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
-.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 0s"))
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 0s"""))
.withFallback(MultiNodeClusterSpec.clusterConfigWithFailureDetectorPuppet))
}

View file

@@ -39,7 +39,8 @@ object MultiDcSplitBrainMultiJvmSpec extends MultiNodeConfig {
akka.cluster {
  gossip-interval = 500ms
  leader-actions-interval = 1s
-  auto-down-unreachable-after = 1s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 1s
}
""")
.withFallback(MultiNodeClusterSpec.clusterConfig))

View file

@@ -21,8 +21,10 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
val third = role("third")

commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 1s
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 1s
akka.cluster.prune-gossip-tombstones-after = 1s
akka.remote.classic.log-frame-size-exceeding = 1200b
akka.remote.artery.advanced.aeron {

@@ -30,7 +32,8 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
  embedded-media-driver = off
  aeron-dir = "target/aeron-NodeChurnSpec"
}
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))

class LogListener(testActor: ActorRef) extends Actor {
def receive = {

View file

@@ -18,7 +18,7 @@ object NodeDowningAndBeingRemovedMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false).withFallback(
ConfigFactory
-.parseString("akka.cluster.auto-down-unreachable-after = off")
+.parseString("akka.cluster.testkit.auto-down-unreachable-after = off")
.withFallback(MultiNodeClusterSpec.clusterConfig)))
}

View file

@@ -21,10 +21,12 @@ object QuickRestartMultiJvmSpec extends MultiNodeConfig {
val third = role("third")

commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.allow-weakly-up-members = off
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))
}

View file

@@ -28,11 +28,13 @@ object RestartFirstSeedNodeMultiJvmSpec extends MultiNodeConfig {
val seed3 = role("seed3")

commonConfig(
-debugConfig(on = false).withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
+akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.retry-unsuccessful-join-after = 3s
akka.cluster.allow-weakly-up-members = off
-""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))
}

class RestartFirstSeedNodeMultiJvmNode1 extends RestartFirstSeedNodeSpec

View file

@@ -28,7 +28,8 @@ object RestartNode2SpecMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 2s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.cluster.retry-unsuccessful-join-after = 3s
akka.cluster.allow-weakly-up-members = off
akka.remote.retry-gate-closed-for = 45s

View file

@@ -29,7 +29,7 @@ object RestartNode3MultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = off
+akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.allow-weakly-up-members = off
# test is using Java serialization and not priority to rewrite
akka.actor.allow-java-serialization = on

View file

@@ -34,7 +34,8 @@ object RestartNodeMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
-akka.cluster.auto-down-unreachable-after = 5s
+akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
+akka.cluster.testkit.auto-down-unreachable-after = 5s
akka.cluster.allow-weakly-up-members = off
#akka.remote.use-passive-connections = off
# test is using Java serialization and not priority to rewrite

View file

@@ -16,12 +16,16 @@ final case class SingletonClusterMultiNodeConfig(failureDetectorPuppet: Boolean)
val first = role("first")
val second = role("second")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
akka.cluster {
-  auto-down-unreachable-after = 0s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 0s
  failure-detector.threshold = 4
}
-""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
+"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
}

View file

@@ -21,12 +21,16 @@ final case class SplitBrainMultiNodeConfig(failureDetectorPuppet: Boolean) exten
val fourth = role("fourth")
val fifth = role("fifth")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
akka.remote.retry-gate-closed-for = 3 s
akka.cluster {
-  auto-down-unreachable-after = 1s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 1s
  failure-detector.threshold = 4
-}""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
+}"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))

testTransport(on = true)
}

View file

@@ -34,10 +34,14 @@ object StreamRefSpec extends MultiNodeConfig {
val second = role("second")
val third = role("third")

-commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
+commonConfig(
+debugConfig(on = false)
+.withFallback(ConfigFactory.parseString("""
akka.cluster {
-  auto-down-unreachable-after = 1s
-}""")).withFallback(MultiNodeClusterSpec.clusterConfig))
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 1s
+}"""))
+.withFallback(MultiNodeClusterSpec.clusterConfig))

testTransport(on = true)

View file

@@ -119,7 +119,8 @@ private[cluster] object StressMultiJvmSpec extends MultiNodeConfig {
akka.actor.provider = cluster
akka.cluster {
  failure-detector.acceptable-heartbeat-pause = 10s
-  auto-down-unreachable-after = 1s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 1s
  publish-stats-interval = 1s
}
akka.loggers = ["akka.testkit.TestEventListener"]

View file

@@ -42,7 +42,6 @@ class ClusterConfigSpec extends AkkaSpec {
LeaderActionsInterval should ===(1 second)
UnreachableNodesReaperInterval should ===(1 second)
PublishStatsInterval should ===(Duration.Undefined)
-AutoDownUnreachableAfter should ===(Duration.Undefined)
DownRemovalMargin should ===(Duration.Zero)
MinNrOfMembers should ===(1)
MinNrOfMembersOfRole should ===(Map.empty[String, Int])

View file

@@ -11,7 +11,8 @@ import com.typesafe.config.{ Config, ConfigFactory }
object ClusterLogSpec {
val config = """
akka.cluster {
-  auto-down-unreachable-after = 0s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 0s
  publish-stats-interval = 0 s # always, when it happens
  failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet
}

View file

@@ -30,7 +30,8 @@ import scala.concurrent.duration._
object ClusterSpec {
val config = """
akka.cluster {
-  auto-down-unreachable-after = 0s
+  downing-provider-class = akka.cluster.testkit.AutoDowning
+  testkit.auto-down-unreachable-after = 0s
  periodic-tasks-initial-delay = 120 seconds // turn off scheduled tasks
  publish-stats-interval = 0 s # always, when it happens
  failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet

View file

@@ -6,14 +6,17 @@ package akka.cluster

import java.util.concurrent.atomic.AtomicBoolean

-import scala.concurrent.duration._
import akka.ConfigurationException
-import akka.actor.{ ActorSystem, Props }
-import akka.testkit.TestKit.{ awaitCond, shutdownActorSystem }
+import akka.actor.ActorSystem
+import akka.actor.Props
+import akka.testkit.TestKit.awaitCond
+import akka.testkit.TestKit.shutdownActorSystem
import akka.util.unused
import com.typesafe.config.ConfigFactory
-import org.scalatest.{ Matchers, WordSpec }
+import org.scalatest.Matchers
+import org.scalatest.WordSpec
+
+import scala.concurrent.duration._

class FailingDowningProvider(@unused system: ActorSystem) extends DowningProvider {
  override val downRemovalMargin: FiniteDuration = 20.seconds

@@ -39,6 +42,10 @@ class DowningProviderSpec extends WordSpec with Matchers {
loglevel = WARNING
actor.provider = "cluster"
remote {
+  artery.canonical {
+    hostname = 127.0.0.1
+    port = 0
+  }
  classic.netty.tcp {
    hostname = "127.0.0.1"
    port = 0

@@ -55,16 +62,6 @@ class DowningProviderSpec extends WordSpec with Matchers {
  shutdownActorSystem(system)
}

-"use akka.cluster.AutoDowning if 'auto-down-unreachable-after' is configured" in {
-  val system = ActorSystem(
-    "auto-downing",
-    ConfigFactory.parseString("""
-      akka.cluster.auto-down-unreachable-after = 18d
-    """).withFallback(baseConf))
-  Cluster(system).downingProvider shouldBe an[AutoDowning]
-  shutdownActorSystem(system)
-}
-
"use the specified downing provider" in {
val system = ActorSystem(
"auto-downing",

View file

@@ -259,7 +259,8 @@ class JoinConfigCompatCheckerSpec extends AkkaSpec with ClusterTestKit {
akka.cluster {
  # using explicit downing provider class
-  downing-provider-class = "akka.cluster.AutoDowning"
+  downing-provider-class = "akka.cluster.testkit.AutoDowning"
+  testkit.auto-down-unreachable-after = 0s
  configuration-compatibility-check {
    enforce-on-join = on
View file

@@ -2,17 +2,82 @@
/*
 * Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
 */

-package akka.cluster
+package akka.cluster.testkit

-import akka.ConfigurationException
-import akka.actor.{ Actor, ActorSystem, Address, Cancellable, Props, Scheduler }
-import scala.concurrent.duration.FiniteDuration
-import akka.cluster.ClusterEvent._
import scala.concurrent.duration.Duration
+import scala.concurrent.duration.FiniteDuration
+
+import akka.actor.Actor
import akka.actor.ActorLogging
-import com.github.ghik.silencer.silent
+import akka.actor.ActorSystem
+import akka.actor.Address
+import akka.actor.Cancellable
+import akka.actor.Props
+import akka.actor.Scheduler
+import akka.cluster.Cluster
+import akka.cluster.ClusterEvent._
+import akka.cluster.DowningProvider
+import akka.cluster.Member
+import akka.cluster.MembershipState
+import akka.cluster.UniqueAddress
+import akka.util.Helpers.ConfigOps
+import akka.util.Helpers.Requiring
+import akka.util.Helpers.toRootLowerCase
+
+/**
+ * Downing provider used for testing.
+ *
+ * Auto-downing is a naïve approach to remove unreachable nodes from the cluster membership.
+ * In a production environment it will eventually break down the cluster.
+ * When a network partition occurs, both sides of the partition will see the other side as unreachable
+ * and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
+ * (known as *Split Brain*).
+ *
+ * This behavior is not limited to network partitions. It can also occur if a node in the cluster is
+ * overloaded, or experiences a long GC pause.
+ *
+ * When using Cluster Singleton or Cluster Sharding it can break the contract provided by those features.
+ * Both provide a guarantee that an actor will be unique in a cluster.
+ * With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
+ * When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior
+ * in the system.
+ *
+ * This is even more severe when Akka Persistence is used in conjunction with Cluster Sharding.
+ * In this case, the lack of unique actors can cause multiple actors to write to the same journal.
+ * Akka Persistence operates on a single writer principle. Having multiple writers will corrupt
+ * the journal and make it unusable.
+ *
+ * Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
+ * system to form multiple small clusters. These small clusters will be independent from each other. They will be
+ * unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
+ * it will require manual intervention in order to reform the cluster.
+ *
+ * Because of these issues, auto-downing should never be used in a production environment.
+ */
+final class AutoDowning(system: ActorSystem) extends DowningProvider {
+
+  private def clusterSettings = Cluster(system).settings
+
+  private val AutoDownUnreachableAfter: Duration = {
+    val key = "akka.cluster.testkit.auto-down-unreachable-after"
+    // it's not in reference.conf, since only used in tests
+    if (clusterSettings.config.hasPath(key)) {
+      toRootLowerCase(clusterSettings.config.getString(key)) match {
+        case "off" => Duration.Undefined
+        case _     => clusterSettings.config.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
+      }
+    } else
+      Duration.Undefined
+  }
+
+  override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
+
+  override def downingActorProps: Option[Props] =
+    AutoDownUnreachableAfter match {
+      case d: FiniteDuration => Some(AutoDown.props(d))
+      case _                 => None // auto-down-unreachable-after = off
+    }
+}
/**
 * INTERNAL API
 */

@@ -25,26 +90,6 @@ private[cluster] object AutoDown {
  final case class UnreachableTimeout(node: UniqueAddress)
}

-/**
- * Used when no custom provider is configured and 'auto-down-unreachable-after' is enabled.
- */
-final class AutoDowning(system: ActorSystem) extends DowningProvider {
-
-  private def clusterSettings = Cluster(system).settings
-
-  @silent("deprecated")
-  override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
-
-  override def downingActorProps: Option[Props] =
-    clusterSettings.AutoDownUnreachableAfter match {
-      case d: FiniteDuration => Some(AutoDown.props(d))
-      case _ =>
-        // I don't think this can actually happen
-        throw new ConfigurationException(
-          "AutoDowning downing provider selected but 'akka.cluster.auto-down-unreachable-after' not set")
-    }
-}

/**
 * INTERNAL API
 *

@@ -68,9 +113,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)
  // re-subscribe when restart
  override def preStart(): Unit = {
-    log.warning(
-      "Don't use auto-down feature of Akka Cluster in production. " +
-      "See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.")
+    log.debug("Auto-down is enabled in test.")
    cluster.subscribe(self, classOf[ClusterDomainEvent])
    super.preStart()
  }

@@ -81,11 +124,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)
  override def down(node: Address): Unit = {
    require(leader)
-    logInfo(
-      "Leader is auto-downing unreachable node [{}]. " +
-      "Don't use auto-down feature of Akka Cluster in production. " +
-      "See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.",
-      node)
+    logInfo("Leader is auto-downing unreachable node [{}].", node)
    cluster.down(node)
  }

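A usage sketch (not part of the diff) of how a test enables the relocated provider, mirroring the configuration change applied throughout the specs above; the object name is illustrative:

```scala
import akka.actor.ActorSystem
import com.typesafe.config.ConfigFactory

object AutoDowningTestExample extends App {
  // The provider is selected via downing-provider-class; its timeout lives
  // under akka.cluster.testkit since it is no longer in reference.conf.
  val system = ActorSystem(
    "AutoDowningTest",
    ConfigFactory.parseString("""
      akka.actor.provider = cluster
      akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
      akka.cluster.testkit.auto-down-unreachable-after = 1s
    """))
}
```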
View file

@@ -2,15 +2,18 @@
/*
 * Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
 */

-package akka.cluster
+package akka.cluster.testkit

import scala.concurrent.duration._

-import akka.actor.Address
-import akka.actor.Scheduler
import akka.actor.ActorRef
+import akka.actor.Address
import akka.actor.Props
-import akka.cluster.MemberStatus._
+import akka.actor.Scheduler
import akka.cluster.ClusterEvent._
+import akka.cluster.Member
+import akka.cluster.MemberStatus._
+import akka.cluster.TestMember
import akka.remote.RARP
import akka.testkit.AkkaSpec
import akka.testkit.TimingTest

View file

@@ -32,7 +32,8 @@ object LotsOfDataBot {
// Override the configuration of the port
val config = ConfigFactory
.parseString("akka.remote.classic.netty.tcp.port=" + port)
-.withFallback(ConfigFactory.load(ConfigFactory.parseString("""
+.withFallback(
+ConfigFactory.load(ConfigFactory.parseString("""
passive = off
max-entries = 100000
akka.actor.provider = "cluster"

@@ -48,7 +49,8 @@ object LotsOfDataBot {
  "akka://ClusterSystem@127.0.0.1:2551",
  "akka://ClusterSystem@127.0.0.1:2552"]
-auto-down-unreachable-after = 10s
+downing-provider-class = akka.cluster.testkit.AutoDowning
+testkit.auto-down-unreachable-after = 10s
}
""")))

View file

@@ -227,14 +227,6 @@ graceful leaving process of a cluster member.

See @ref:[removal of Internal Cluster Sharding Data](typed/cluster-sharding.md#removal-of-internal-cluster-sharding-data) in the documentation of the new APIs.

-## Configuration
-
-`ClusterShardingSettings` is a parameter to the `start` method of
-the `ClusterSharding` extension, i.e. each entity type can be configured with different settings
-if needed.
-
-See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.

## Inspecting cluster sharding state

Two requests to inspect the cluster state are available:
@@ -256,20 +248,13 @@ directly sending messages to the individual entities.

## Lease

-A @ref[lease](coordination.md) can be used as an additional safety measure to ensure a shard
-does not run on two nodes.
-
-Reasons for how this can happen:
-
-* Network partitions without an appropriate downing provider
-* Mistakes in the deployment process leading to two separate Akka Clusters
-* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
-
-A lease can be a final backup that means that each shard won't create child entity actors unless it has the lease.
-To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
-of the lease to use. Each shard will try and acquire a lease with the name `<actor system name>-shard-<type name>-<shard id>` and
-the owner is set to the `Cluster(system).selfAddress.hostPort`.
-If a shard can't acquire a lease it will remain uninitialized so messages for entities it owns will
-be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
+A lease can be used as an additional safety measure to ensure a shard does not run on two nodes.
+See @ref:[Lease](typed/cluster-sharding.md#lease) in the documentation of the new APIs.
+
+## Configuration
+
+`ClusterShardingSettings` is a parameter to the `start` method of
+the `ClusterSharding` extension, i.e. each entity type can be configured with different settings
+if needed.
+
+See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.

View file

@@ -104,6 +104,14 @@ Scala

Java
:  @@snip [SimpleClusterListener2.java](/akka-docs/src/test/java/jdocs/cluster/SimpleClusterListener2.java) { #join }

+## Leaving
+
+See @ref:[Leaving](typed/cluster.md#leaving) in the documentation of the new APIs.
+
+## Downing
+
+See @ref:[Downing](typed/cluster.md#downing) in the documentation of the new APIs.

<a id="cluster-subscriber"></a>
## Subscribe to Cluster Events

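Since the commit points users at manual downing as the default alternative, here is a small sketch of what that looks like with the classic Cluster extension (the object and method names are illustrative; `Cluster(system).down` is the real API):

```scala
import akka.actor.{ ActorSystem, Address }
import akka.cluster.Cluster

object ManualDowning {
  // Mark a crashed member as Down so the leader can remove it; typically
  // invoked from an operational tool or an HTTP management endpoint.
  def downMember(system: ActorSystem, node: Address): Unit =
    Cluster(system).down(node)
}
```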
View file

@@ -3,7 +3,7 @@

## Commercial Support

Commercial support is provided by [Lightbend](http://www.lightbend.com).
-Akka is part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
+Akka is part of the [Lightbend Platform](http://www.lightbend.com/platform).

## Sponsors

View file

@@ -11,6 +11,40 @@ is [no longer available as a static method](https://github.com/scala/bug/issues/

If you are still using Scala 2.11 then you must upgrade to 2.12 or 2.13

+## Auto-downing removed
+
+Auto-downing of unreachable Cluster members has been removed after warnings and recommendations against using it
+for many years. It was disabled by default, but could be enabled with the configuration
+`akka.cluster.auto-down-unreachable-after`.
+
+For alternatives see the @ref:[documentation about Downing](../typed/cluster.md#downing).
+
+Auto-downing was a naïve approach to remove unreachable nodes from the cluster membership.
+In a production environment it will eventually break down the cluster.
+When a network partition occurs, both sides of the partition will see the other side as unreachable
+and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
+(known as *Split Brain*).
+
+This behavior is not limited to network partitions. It can also occur if a node in the cluster is
+overloaded, or experiences a long GC pause.
+
+When using @ref:[Cluster Singleton](../typed/cluster-singleton.md) or @ref:[Cluster Sharding](../typed/cluster-sharding.md)
+it can break the contract provided by those features. Both provide a guarantee that an actor will be unique in a cluster.
+With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
+When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior in the system.
+
+This is even more severe when @ref:[Akka Persistence](../typed/persistence.md) is used in conjunction with
+Cluster Sharding. In this case, the lack of unique actors can cause multiple actors to write to the same journal.
+Akka Persistence operates on a single writer principle. Having multiple writers will corrupt the journal
+and make it unusable.
+
+Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
+system to form multiple small clusters. These small clusters will be independent from each other. They will be
+unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
+it will require manual intervention in order to reform the cluster.
+
+Because of these issues, auto-downing should **never** be used in a production environment.
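As a concrete sketch of the migration (the Split Brain Resolver class name below is the commercial akka-enhancements provider referenced by the docs at the time of this release; treat it as an assumption and verify against those docs):

```scala
import akka.actor.ActorSystem
import com.typesafe.config.ConfigFactory

object MigratedDowningConfig extends App {
  // Removed in 2.6:  akka.cluster.auto-down-unreachable-after = 10s
  // Replacement: configure a real downing strategy, or leave
  // downing-provider-class empty (NoDowning) and down members manually.
  val system = ActorSystem(
    "ClusterSystem",
    ConfigFactory.parseString("""
      akka.actor.provider = cluster
      akka.cluster.downing-provider-class = "com.lightbend.akka.sbr.SplitBrainResolverProvider"
    """))
}
```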
## Removed features that were deprecated ## Removed features that were deprecated
After being deprecated since 2.5.0, the following have been removed in Akka 2.6. After being deprecated since 2.5.0, the following have been removed in Akka 2.6.
@ -94,13 +128,25 @@ to make remote interactions look like local method calls.
Warnings about `TypedActor` have been [mentioned in documentation](https://doc.akka.io/docs/akka/2.5/typed-actors.html#when-to-use-typed-actors) Warnings about `TypedActor` have been [mentioned in documentation](https://doc.akka.io/docs/akka/2.5/typed-actors.html#when-to-use-typed-actors)
for many years. for many years.
### akka-protobuf
`akka-protobuf` was never intended to be used by end users but perhaps this was not well-documented.
Applications should use standard Protobuf dependency instead of `akka-protobuf`. The artifact is still
published, but the transitive dependency to `akka-protobuf` has been removed.
Akka is now using Protobuf version 3.9.0 for serialization of messages defined by Akka.
### Cluster Client
Cluster client has been deprecated as of 2.6 in favor of [Akka gRPC](https://doc.akka.io/docs/akka-grpc/current/index.html).
It is not advised to build new applications with Cluster client, and existing users @ref[should migrate to Akka gRPC](../cluster-client.md#migration-to-akka-grpc).
### akka.Main

`akka.Main` is deprecated in favour of starting the `ActorSystem` from a custom main class instead. `akka.Main` was not
adding much value and typically a custom main class is needed anyway.
## Remoting
### Default remoting is now Artery TCP
@@ -184,20 +230,7 @@ For TCP:
Classic remoting is deprecated but can be used in `2.6`. Explicitly disable Artery by setting property `akka.remote.artery.enabled` to `false`. Further, any configuration under `akka.remote` that is
specific to classic remoting needs to be moved to `akka.remote.classic`. To see which configuration options
are specific to classic remoting, search for them in: @ref:[`akka-remote/reference.conf`](../general/configuration.md#config-akka-remote).
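As a sketch, a configuration enabling classic remoting in 2.6 could look like the following (hostname and port are placeholders, and the `netty.tcp` block is shown only as an example of a setting that moved under `akka.remote.classic`):

```
akka.remote.artery.enabled = false
akka.remote.classic.netty.tcp {
  hostname = "127.0.0.1"
  port = 2552
}
```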
## Java Serialization
@@ -235,14 +268,12 @@ handling that type and it was previously "accidentally" serialized with Java ser
The following documents configuration changes and behavior changes where no action is required. In some cases the old
behavior can be restored via configuration.
### Remoting dependencies have been made optional
Classic remoting depends on Netty and Artery UDP depends on Aeron. These are now both optional dependencies that need
to be explicitly added. See @ref[classic remoting](../remoting.md) or @ref[artery remoting](../remoting-artery.md) for instructions.
### Remote watch and deployment have been disabled without Cluster use
By default, these remoting features are disabled when not using Akka Cluster:
@@ -43,10 +43,10 @@ if that feature is enabled.
@@@ warning
Make sure not to use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple shards and entities*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
@@ -304,6 +304,26 @@ rebalanced to other nodes.
See @ref:[How To Startup when Cluster Size Reached](cluster.md#how-to-startup-when-a-cluster-size-is-reached)
for more information about `min-nr-of-members`.
## Lease
A @ref[lease](../coordination.md) can be used as an additional safety measure to ensure a shard
does not run on two nodes.
Reasons why this can happen:
* Network partitions without an appropriate downing provider
* Mistakes in the deployment process leading to two separate Akka Clusters
* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
A lease can act as a final backup, meaning that each shard won't create child entity actors unless it holds the lease.
To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
of the lease to use. Each shard will try to acquire a lease with the name `<actor system name>-shard-<type name>-<shard id>` and
the owner is set to `Cluster(system).selfAddress.hostPort`.
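For illustration, a sketch of such a configuration, where `my-lease` is a hypothetical config location for a lease implementation and `com.example.MyLease` a hypothetical implementation class:

```
akka.cluster.sharding.use-lease = "my-lease"

# hypothetical lease implementation, defined at the configured location
my-lease {
  lease-class = "com.example.MyLease"
}
```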
If a shard can't acquire a lease it will remain uninitialized, so messages for entities it owns will
be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
## Removal of internal Cluster Sharding data

Removal of internal Cluster Sharding data is only relevant for "Persistent Mode".
@@ -326,15 +346,6 @@ cannot startup because of corrupt data, which may happen if accidentally
two clusters were running at the same time, e.g. caused by using auto-down
and there was a network partition.
Use this program as a standalone Java main program:

```
@@ -347,7 +358,7 @@ The program is included in the `akka-cluster-sharding` jar file. It
is easiest to run it with the same classpath and configuration as your ordinary
application. It can be run from sbt or Maven in a similar way.
Specify the entity type names (same as you use in the `init` method
of `ClusterSharding`) as program arguments.
If you specify `-2.3` as the first program argument it will also try
@@ -32,6 +32,15 @@ such as single-point of bottleneck. Single-point of failure is also a relevant c
but for some cases this feature takes care of that by making sure that another singleton
instance will eventually be started.
@@@ warning
Make sure not to use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
### Singleton manager

The cluster singleton pattern manages one singleton actor instance among all cluster nodes or a group of nodes tagged with
@@ -80,23 +89,20 @@ The singleton instance will not run on members with status @ref:[WeaklyUp](clust
This pattern may seem to be very tempting to use at first, but it has several drawbacks, some of them are listed below:
* The cluster singleton may quickly become a *performance bottleneck*.
* You can not rely on the cluster singleton to be *non-stop* available — e.g. when the node on which the singleton
  has been running dies, it will take a few seconds for this to be noticed and the singleton to be migrated to another node.
* If many singletons are used, be aware that all of them will run on the oldest node (or the oldest node with the configured role).
  @ref:[Cluster Sharding](cluster-sharding.md) combined with keeping the "singleton" entities alive can be a better
  alternative.
@@@ warning
Make sure not to use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
## Example
@@ -255,95 +255,69 @@ after the restart, when it comes up as a new incarnation of an existing member in the
trying to join in, then the existing one will be removed from the cluster and then it will
be allowed to join.
### Leaving
There are a few ways to remove a member from the cluster.
1. The recommended way to leave a cluster is a graceful exit, informing the cluster that a node shall leave.
   This is performed by @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) when the `ActorSystem`
   is terminated and also when a SIGTERM is sent from the environment to stop the JVM process.
1. Graceful exit can also be performed using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).
1. When a graceful exit is not possible, for example in case of abrupt termination of the JVM process, the node
   will be detected as unreachable by other nodes and removed after @ref:[Downing](#downing).
Graceful leaving will offer faster hand off to peer nodes during node shutdown than abrupt termination and downing.

The @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will also run when the cluster node sees itself as
`Exiting`, i.e. leaving from another node will trigger the shutdown process on the leaving node.
Tasks for graceful leaving of the cluster, including graceful shutdown of Cluster Singletons and
Cluster Sharding, are added automatically when Akka Cluster is used, i.e. running the shutdown
process will also trigger the graceful leaving if it's not already in progress.
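As an illustrative sketch, a graceful leave can also be initiated programmatically with the typed Cluster API; the system name and the empty guardian behavior here are placeholders:

```scala
import akka.actor.typed.ActorSystem
import akka.actor.typed.scaladsl.Behaviors
import akka.cluster.typed.{ Cluster, Leave }

object LeaveExample {
  def main(args: Array[String]): Unit = {
    val system = ActorSystem(Behaviors.empty[Nothing], "ClusterSystem")
    val cluster = Cluster(system)
    // Ask the cluster to gracefully remove this node; Coordinated Shutdown
    // will run on it once the member is seen as Exiting.
    cluster.manager ! Leave(cluster.selfMember.address)
  }
}
```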
Normally this is handled automatically, but in case of network failures during this process it might still
be necessary to set the node's status to `Down` in order to complete the removal, see @ref:[Downing](#downing).
### Downing
In many cases a member can gracefully exit from the cluster as described in @ref:[Leaving](#leaving), but
there are scenarios when an explicit downing decision is needed before it can be removed. For example in case
of abrupt termination of the JVM process, system overload that doesn't recover, or network partitions
that don't heal. In such cases the node(s) will be detected as unreachable by other nodes, but they must also
be marked as `Down` before they are removed.
When a member is considered by the failure detector to be `unreachable` the
leader is not allowed to perform its duties, such as changing status of
new joining members to `Up`. The node must first become `reachable` again, or the
status of the unreachable member must be changed to `Down`. Changing status to `Down`
can be performed automatically or manually.
By default, downing must be performed manually using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).
Note that @ref:[Cluster Singleton](cluster-singleton.md) or @ref:[Cluster Sharding entities](cluster-sharding.md) that
are running on a crashed (unreachable) node will not be started on another node until the previous node has
been removed from the Cluster. Removal of crashed (unreachable) nodes is performed after a downing decision.
A production solution for downing is provided by
[Split Brain Resolver](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html),
which is part of the [Lightbend Platform](http://www.lightbend.com/platform).
If you don't have a Lightbend Platform Subscription, you should still carefully read the
[documentation](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html)
of the Split Brain Resolver and make sure that the solution you are using handles the concerns and scenarios
described there.
A custom downing strategy can be implemented with a @apidoc[akka.cluster.DowningProvider] and enabled with
configuration `akka.cluster.downing-provider-class`.
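A minimal sketch of such a provider, assuming a hypothetical `com.example.MyDowningProvider` class; the two members shown are the ones a `DowningProvider` must implement, and the values are illustrative:

```scala
import akka.actor.{ ActorSystem, Props }
import akka.cluster.DowningProvider
import scala.concurrent.duration._

// Enabled with:
// akka.cluster.downing-provider-class = "com.example.MyDowningProvider"
final class MyDowningProvider(system: ActorSystem) extends DowningProvider {

  // Margin given after a downing decision before Singleton and Sharding
  // hand-over takes place; illustrative value.
  override def downRemovalMargin: FiniteDuration = 10.seconds

  // Props of an actor that watches reachability and issues downing
  // decisions; None means no automatic downing (manual only).
  override def downingActorProps: Option[Props] = None
}
```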
Downing can also be performed programmatically with @scala[`Cluster(system).manager ! Down(address)`]@java[`Cluster.get(system).manager().tell(Down(address))`],
but that is mostly useful from tests and when implementing a `DowningProvider`.
If a crashed node is restarted with the same hostname and port and joins the cluster again, the previous incarnation
of that member will be downed and removed. The new join attempt with the same hostname and port is used as evidence
that the previous incarnation is not alive any more.
If a node is still running and sees itself as `Down` it will shut down. @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically
run if `run-coordinated-shutdown-when-down` is set to `on` (the default), however the node will not try
to leave the cluster gracefully.
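For reference, a sketch of the corresponding setting with its default value:

```
akka.cluster.run-coordinated-shutdown-when-down = on
```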
## Node Roles