Remove auto-downing, #27788 (#27855)

* moved to cluster tests, in new package akka.cluster.testkit
* changed config in tests
* migration guide
* documentation clarifications for Downing and Leaving
* update warnings in Singleton and Sharding
Patrik Nordwall 2019-10-03 14:08:43 +02:00 committed by GitHub
parent 064f06f5a6
commit a217d5566e
61 changed files with 414 additions and 309 deletions

View file

@ -63,7 +63,8 @@ abstract class ClusterShardingFailureSpecConfig(val mode: String) extends MultiN
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.roles = ["backend"]
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {

View file

@ -50,7 +50,8 @@ object ClusterShardingGetStateSpecConfig extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
coordinator-failure-backoff = 3s
shard-failure-backoff = 3s

View file

@ -56,7 +56,8 @@ object ClusterShardingGetStatsSpecConfig extends MultiNodeConfig {
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.log-dead-letters-during-shutdown = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
state-store-mode = "ddata"
updating-state-timeout = 2s

View file

@ -67,7 +67,8 @@ abstract class ClusterShardingLeavingSpecConfig(val mode: String) extends MultiN
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {
timeout = 5s

View file

@ -55,7 +55,8 @@ object ClusterShardingQueriesSpecConfig extends MultiNodeConfig {
akka.actor.provider = "cluster"
akka.remote.classic.log-remote-lifecycle-events = off
akka.log-dead-letters-during-shutdown = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.sharding {
state-store-mode = "ddata"
shard-region-query-timeout = 0ms

View file

@ -61,7 +61,8 @@ abstract class ClusterShardingRememberEntitiesNewExtractorSpecConfig(val mode: S
ConfigFactory
.parseString(s"""
akka.actor.provider = "cluster"
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.classic.log-remote-lifecycle-events = off
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"
akka.persistence.journal.leveldb-shared {

View file

@ -56,7 +56,8 @@ object ClusterShardingRememberEntitiesPerfSpecConfig extends MultiNodeConfig {
commonConfig(ConfigFactory.parseString(s"""
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.testconductor.barrier-timeout = 3 minutes
akka.remote.artery.advanced.outbound-message-queue-size = 10000

View file

@ -69,7 +69,8 @@ abstract class ClusterShardingRememberEntitiesSpecConfig(val mode: String, val r
modeConfig
.withFallback(ConfigFactory.parseString(s"""
akka.actor.provider = "cluster"
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.cluster.sharding.state-store-mode = "$mode"
akka.cluster.sharding.distributed-data.durable.lmdb {

View file

@ -135,7 +135,8 @@ abstract class ClusterShardingSpecConfig(val mode: String, val entityRecoveryStr
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.roles = ["backend"]
akka.cluster.distributed-data.gossip-interval = 1s
akka.persistence.journal.plugin = "akka.persistence.journal.leveldb-shared"

View file

@ -61,7 +61,8 @@ object MultiDcClusterShardingSpecConfig extends MultiNodeConfig {
akka.cluster {
debug.verbose-heartbeat-logging = on
debug.verbose-gossip-logging = on
auto-down-unreachable-after = 0s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 0s
sharding {
retry-interval = 200ms
}

View file

@ -44,7 +44,8 @@ abstract class MultiNodeClusterShardingConfig(
.withFallback(ConfigFactory.parseString(s"""
akka.loglevel = $loglevel
akka.actor.provider = "cluster"
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.remote.log-remote-lifecycle-events = off
akka.cluster.sharding.state-store-mode = "$mode"
akka.cluster.sharding.distributed-data.durable.lmdb {

View file

@ -36,7 +36,6 @@ import akka.pattern.ask
import akka.pattern.pipe
import akka.util.JavaDurationConverters._
import akka.util.Timeout
import com.github.ghik.silencer.silent
import com.typesafe.config.Config
object ClusterSingletonManagerSettings {
@ -45,7 +44,6 @@ object ClusterSingletonManagerSettings {
* Create settings from the default configuration
* `akka.cluster.singleton`.
*/
@silent("deprecated") // DownRemovalMargin
def apply(system: ActorSystem): ClusterSingletonManagerSettings =
apply(system.settings.config.getConfig("akka.cluster.singleton"))
// note that this setting has some additional logic inside the ClusterSingletonManager

View file

@ -45,7 +45,8 @@ object ClusterClientSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.client.heartbeat-interval = 1s
akka.cluster.client.acceptable-heartbeat-pause = 3s
akka.cluster.client.refresh-contacts-interval = 1s

View file

@ -30,7 +30,8 @@ object DistributedPubSubMediatorSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
akka.cluster.pub-sub.max-delta-elements = 500
"""))

View file

@ -33,7 +33,8 @@ object DistributedPubSubRestartSpec extends MultiNodeConfig {
akka.cluster.pub-sub.gossip-interval = 500ms
akka.actor.provider = cluster
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = off
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = off
"""))
testTransport(on = true)

View file

@ -35,7 +35,8 @@ object ClusterSingletonManagerChaosSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))
case object EchoStarted

View file

@ -28,7 +28,8 @@ object ClusterSingletonManagerLeaseSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
test-lease {
lease-class = akka.cluster.TestLeaseActorClient
heartbeat-interval = 1s

View file

@ -33,7 +33,8 @@ object ClusterSingletonManagerLeave2Spec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = off
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = off
"""))
case object EchoStarted

View file

@ -26,7 +26,8 @@ object ClusterSingletonManagerLeaveSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = off
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = off
"""))
case object EchoStarted

View file

@ -40,7 +40,8 @@ object ClusterSingletonManagerSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))
nodeConfig(first, second, third, fourth, fifth, sixth)(ConfigFactory.parseString("akka.cluster.roles =[worker]"))

View file

@ -27,7 +27,8 @@ object ClusterSingletonManagerStartupSpec extends MultiNodeConfig {
akka.loglevel = INFO
akka.actor.provider = "cluster"
akka.remote.log-remote-lifecycle-events = off
akka.cluster.auto-down-unreachable-after = 0s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s
"""))
case object EchoStarted

View file

@ -44,7 +44,8 @@ class ClusterSingletonLeavingSpeedSpec
"""
akka.loglevel = DEBUG
akka.actor.provider = akka.cluster.ClusterActorRefProvider
akka.cluster.auto-down-unreachable-after = 2s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 2s
# With 10 systems and setting min-number-of-hand-over-retries to 5 and gossip-interval to 2s it's possible to
# reproduce the ClusterSingletonManagerIsStuck and slow hand over in issue #25639

View file

@ -31,7 +31,8 @@ class ClusterSingletonRestart2Spec
akka.loglevel = INFO
akka.cluster.roles = [singleton]
akka.actor.provider = akka.cluster.ClusterActorRefProvider
akka.cluster.auto-down-unreachable-after = 2s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.cluster.singleton.min-number-of-hand-over-retries = 5
akka.remote {
classic.netty.tcp {

View file

@ -14,10 +14,12 @@ import akka.testkit.TestActors
import akka.testkit.TestProbe
import com.typesafe.config.ConfigFactory
class ClusterSingletonRestartSpec extends AkkaSpec("""
class ClusterSingletonRestartSpec
extends AkkaSpec("""
akka.loglevel = INFO
akka.actor.provider = akka.cluster.ClusterActorRefProvider
akka.cluster.auto-down-unreachable-after = 2s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.remote {
classic.netty.tcp {
hostname = "127.0.0.1"

View file

@ -52,7 +52,6 @@ object ClusterReceptionistSpec {
}
akka.cluster {
#auto-down-unreachable-after = 0s
jmx.multi-mbeans-in-same-jvm = on
failure-detector.acceptable-heartbeat-pause = 3s
}

View file

@ -0,0 +1,8 @@
# #27788 Remove AutoDowning
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout")
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$UnreachableTimeout$")
ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.ClusterSettings.AutoDownUnreachableAfter")
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDownBase")
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown$")
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDown")
ProblemFilters.exclude[MissingClassProblem]("akka.cluster.AutoDowning")

View file

@ -35,33 +35,17 @@ akka {
# attempts.
shutdown-after-unsuccessful-join-seed-nodes = off
# Should the 'leader' in the cluster be allowed to automatically mark
# unreachable nodes as DOWN after a configured time of unreachability?
# Using auto-down implies that two separate clusters will automatically be
# formed in case of network partition.
#
# Don't enable this in production, see 'Auto-downing (DO NOT USE)' section
# of Akka Cluster documentation.
#
# Disable with "off" or specify a duration to enable auto-down.
# If a downing-provider-class is configured this setting is ignored.
auto-down-unreachable-after = off
# Time margin after which shards or singletons that belonged to a downed/removed
# partition are created in surviving partition. The purpose of this margin is that
# in case of a network partition the persistent actors in the non-surviving partitions
# must be stopped before corresponding persistent actors are started somewhere else.
# This is useful if you implement downing strategies that handle network partitions,
# e.g. by keeping the larger side of the partition and shutting down the smaller side.
# It will not add any extra safety for auto-down-unreachable-after, since that is not
# handling network partitions.
# Disable with "off" or specify a duration to enable.
down-removal-margin = off
# Pluggable support for downing of nodes in the cluster.
# If this setting is left empty behavior will depend on 'auto-down-unreachable' in the following ways:
# * if it is 'off' the `NoDowning` provider is used and no automatic downing will be performed
# * if it is set to a duration the `AutoDowning` provider is with the configured downing duration
# If this setting is left empty the `NoDowning` provider is used and no automatic downing will be performed.
#
# If specified the value must be the fully qualified class name of a subclass of
# `akka.cluster.DowningProvider` having a public one argument constructor accepting an `ActorSystem`

View file

@ -125,8 +125,19 @@ class Cluster(val system: ExtendedActorSystem) extends Extension {
}
// needs to be lazy to allow downing provider impls to access Cluster (if not we get deadlock)
lazy val downingProvider: DowningProvider =
lazy val downingProvider: DowningProvider = {
checkAutoDownUsage()
DowningProvider.load(settings.DowningProviderClassName, system)
}
private def checkAutoDownUsage(): Unit = {
if (settings.DowningProviderClassName == "akka.cluster.AutoDowning" ||
(settings.config.hasPath("auto-down-unreachable-after") && settings.config.getString(
"auto-down-unreachable-after") != "off"))
logWarning(
"auto-down has been removed in Akka 2.6.0. See " +
"https://doc.akka.io/docs/akka/2.6/typed/cluster.html#downing for alternatives.")
}
// ========================================================
// ===================== WORK DAEMONS =====================

View file

@ -406,12 +406,17 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
override def preStart(): Unit = {
subscribeQuarantinedEvent()
cluster.downingProvider.downingActorProps.foreach { props =>
val propsWithDispatcher =
if (props.dispatcher == Deploy.NoDispatcherGiven) props.withDispatcher(context.props.dispatcher)
else props
cluster.downingProvider.downingActorProps match {
case Some(props) =>
val propsWithDispatcher =
if (props.dispatcher == Deploy.NoDispatcherGiven) props.withDispatcher(context.props.dispatcher)
else props
context.actorOf(propsWithDispatcher, name = "downingProvider")
context.actorOf(propsWithDispatcher, name = "downingProvider")
case None =>
logInfo(
"No downing-provider-class configured, manual cluster downing required, see " +
"https://doc.akka.io/docs/akka/current/typed/cluster.html#downing")
}
if (seedNodes.isEmpty) {
@ -420,7 +425,7 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
else
logInfo(
"No seed-nodes configured, manual cluster join required, see " +
"https://doc.akka.io/docs/akka/current/cluster-usage.html#joining-to-seed-nodes")
"https://doc.akka.io/docs/akka/current/typed/cluster.html#joining")
} else {
self ! JoinSeedNodes(seedNodes)
}

View file

@ -116,21 +116,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s")
}
// specific to the [[akka.cluster.DefaultDowningProvider]]
val AutoDownUnreachableAfter: Duration = {
val key = "auto-down-unreachable-after"
toRootLowerCase(cc.getString(key)) match {
case "off" => Duration.Undefined
case _ => cc.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
}
}
/**
* @deprecated Specific to [[akka.cluster.AutoDown]] should not be used anywhere else, instead
* ``Cluster.downingProvider.downRemovalMargin`` should be used as it allows the downing provider to decide removal
* margins
*/
@deprecated("Use Cluster.downingProvider.downRemovalMargin", since = "2.4.5")
val DownRemovalMargin: FiniteDuration = {
val key = "down-removal-margin"
toRootLowerCase(cc.getString(key)) match {
@ -142,7 +127,6 @@ final class ClusterSettings(val config: Config, val systemName: String) {
val DowningProviderClassName: String = {
val name = cc.getString("downing-provider-class")
if (name.nonEmpty) name
else if (AutoDownUnreachableAfter.isFinite) classOf[AutoDowning].getName
else classOf[NoDowning].getName
}

View file

@ -6,7 +6,6 @@ package akka.cluster
import akka.ConfigurationException
import akka.actor.{ ActorSystem, ExtendedActorSystem, Props }
import com.github.ghik.silencer.silent
import scala.concurrent.duration.FiniteDuration
@ -35,6 +34,15 @@ private[cluster] object DowningProvider {
/**
* API for plugins that will handle downing of cluster nodes. Concrete plugins must subclass and
* have a public one argument constructor accepting an [[akka.actor.ActorSystem]].
*
* A custom `DowningProvider` can be configured with `akka.cluster.downing-provider-class`
*
* When implementing a downing provider you should make sure that it will not split the cluster into
* several separate clusters in case of network problems or system overload (long GC pauses). This
* is much more difficult than it might be perceived at first, so carefully read the concerns and scenarios
* described in
* https://doc.akka.io/docs/akka/current/typed/cluster.html#downing and
* https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html
*/
abstract class DowningProvider {
@ -61,11 +69,9 @@ abstract class DowningProvider {
}
/**
* Default downing provider used when no provider is configured and 'auto-down-unreachable-after'
* is not enabled.
* Default downing provider used when no provider is configured.
*/
final class NoDowning(system: ActorSystem) extends DowningProvider {
@silent("deprecated")
override def downRemovalMargin: FiniteDuration = Cluster(system).settings.DownRemovalMargin
override val downingActorProps: Option[Props] = None
}
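For illustration, a minimal sketch of a custom downing provider that could be enabled with `akka.cluster.downing-provider-class` (the `MyDowningProvider` and `MyDowningActor` names and the naive policy are hypothetical; only the `DowningProvider` contract shown above is from Akka):

```scala
import scala.concurrent.duration._

import akka.actor.{ Actor, ActorSystem, Props }
import akka.cluster.{ Cluster, DowningProvider }
import akka.cluster.ClusterEvent.{ ClusterDomainEvent, UnreachableMember }

// Hypothetical custom downing provider, loaded via its public one-argument constructor.
final class MyDowningProvider(system: ActorSystem) extends DowningProvider {
  // margin before shards/singletons of downed members are started on surviving nodes
  override def downRemovalMargin: FiniteDuration = 20.seconds
  // the actor that carries out the downing decisions
  override def downingActorProps: Option[Props] = Some(Props(new MyDowningActor))
}

class MyDowningActor extends Actor {
  private val cluster = Cluster(context.system)

  override def preStart(): Unit = cluster.subscribe(self, classOf[ClusterDomainEvent])
  override def postStop(): Unit = cluster.unsubscribe(self)

  def receive: Receive = {
    case UnreachableMember(member) =>
      // naive placeholder policy: a real strategy must not split the cluster into
      // separate clusters, see the concerns linked in the scaladoc above
      cluster.down(member.address)
    case _ => // ignore other cluster events
  }
}
```

It would be enabled by setting `akka.cluster.downing-provider-class` to the fully qualified name of the provider class.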

View file

@ -22,7 +22,8 @@ object LeaderDowningAllOtherNodesMultiJvmSpec extends MultiNodeConfig {
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.failure-detector.monitored-by-nr-of-members = 2
akka.cluster.auto-down-unreachable-after = 1s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 1s
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
}

View file

@ -21,7 +21,9 @@ final case class LeaderDowningNodeThatIsUnreachableMultiNodeConfig(failureDetect
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 2s"))
.withFallback(ConfigFactory.parseString("""
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 2s"""))
.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
}

View file

@ -21,7 +21,9 @@ object LeaderLeavingMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("akka.cluster.auto-down-unreachable-after = 0s"))
.withFallback(ConfigFactory.parseString("""
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 0s"""))
.withFallback(MultiNodeClusterSpec.clusterConfigWithFailureDetectorPuppet))
}

View file

@ -39,7 +39,8 @@ object MultiDcSplitBrainMultiJvmSpec extends MultiNodeConfig {
akka.cluster {
gossip-interval = 500ms
leader-actions-interval = 1s
auto-down-unreachable-after = 1s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 1s
}
""")
.withFallback(MultiNodeClusterSpec.clusterConfig))

View file

@ -21,8 +21,10 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
val third = role("third")
commonConfig(
debugConfig(on = false).withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = 1s
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 1s
akka.cluster.prune-gossip-tombstones-after = 1s
akka.remote.classic.log-frame-size-exceeding = 1200b
akka.remote.artery.advanced.aeron {
@ -30,7 +32,8 @@ object NodeChurnMultiJvmSpec extends MultiNodeConfig {
embedded-media-driver = off
aeron-dir = "target/aeron-NodeChurnSpec"
}
""")).withFallback(MultiNodeClusterSpec.clusterConfig))
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
class LogListener(testActor: ActorRef) extends Actor {
def receive = {

View file

@ -18,7 +18,7 @@ object NodeDowningAndBeingRemovedMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false).withFallback(
ConfigFactory
.parseString("akka.cluster.auto-down-unreachable-after = off")
.parseString("akka.cluster.testkit.auto-down-unreachable-after = off")
.withFallback(MultiNodeClusterSpec.clusterConfig)))
}

View file

@ -21,10 +21,12 @@ object QuickRestartMultiJvmSpec extends MultiNodeConfig {
val third = role("third")
commonConfig(
debugConfig(on = false).withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = off
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.allow-weakly-up-members = off
""")).withFallback(MultiNodeClusterSpec.clusterConfig))
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
}

View file

@ -28,11 +28,13 @@ object RestartFirstSeedNodeMultiJvmSpec extends MultiNodeConfig {
val seed3 = role("seed3")
commonConfig(
debugConfig(on = false).withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = off
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.retry-unsuccessful-join-after = 3s
akka.cluster.allow-weakly-up-members = off
""")).withFallback(MultiNodeClusterSpec.clusterConfig))
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
}
class RestartFirstSeedNodeMultiJvmNode1 extends RestartFirstSeedNodeSpec

View file

@ -28,7 +28,8 @@ object RestartNode2SpecMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = 2s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 2s
akka.cluster.retry-unsuccessful-join-after = 3s
akka.cluster.allow-weakly-up-members = off
akka.remote.retry-gate-closed-for = 45s

View file

@ -29,7 +29,7 @@ object RestartNode3MultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = off
akka.cluster.testkit.auto-down-unreachable-after = off
akka.cluster.allow-weakly-up-members = off
# test is using Java serialization and not priority to rewrite
akka.actor.allow-java-serialization = on

View file

@ -34,7 +34,8 @@ object RestartNodeMultiJvmSpec extends MultiNodeConfig {
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = 5s
akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
akka.cluster.testkit.auto-down-unreachable-after = 5s
akka.cluster.allow-weakly-up-members = off
#akka.remote.use-passive-connections = off
# test is using Java serialization and not priority to rewrite

View file

@ -16,12 +16,16 @@ final case class SingletonClusterMultiNodeConfig(failureDetectorPuppet: Boolean)
val first = role("first")
val second = role("second")
commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster {
auto-down-unreachable-after = 0s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 0s
failure-detector.threshold = 4
}
""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
"""))
.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
}

View file

@ -21,12 +21,16 @@ final case class SplitBrainMultiNodeConfig(failureDetectorPuppet: Boolean) exten
val fourth = role("fourth")
val fifth = role("fifth")
commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.remote.retry-gate-closed-for = 3 s
akka.cluster {
auto-down-unreachable-after = 1s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 1s
failure-detector.threshold = 4
}""")).withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
}"""))
.withFallback(MultiNodeClusterSpec.clusterConfig(failureDetectorPuppet)))
testTransport(on = true)
}

View file

@ -34,10 +34,14 @@ object StreamRefSpec extends MultiNodeConfig {
val second = role("second")
val third = role("third")
commonConfig(debugConfig(on = false).withFallback(ConfigFactory.parseString("""
commonConfig(
debugConfig(on = false)
.withFallback(ConfigFactory.parseString("""
akka.cluster {
auto-down-unreachable-after = 1s
}""")).withFallback(MultiNodeClusterSpec.clusterConfig))
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 1s
}"""))
.withFallback(MultiNodeClusterSpec.clusterConfig))
testTransport(on = true)

View file

@ -119,7 +119,8 @@ private[cluster] object StressMultiJvmSpec extends MultiNodeConfig {
akka.actor.provider = cluster
akka.cluster {
failure-detector.acceptable-heartbeat-pause = 10s
auto-down-unreachable-after = 1s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 1s
publish-stats-interval = 1s
}
akka.loggers = ["akka.testkit.TestEventListener"]

View file

@ -42,7 +42,6 @@ class ClusterConfigSpec extends AkkaSpec {
LeaderActionsInterval should ===(1 second)
UnreachableNodesReaperInterval should ===(1 second)
PublishStatsInterval should ===(Duration.Undefined)
AutoDownUnreachableAfter should ===(Duration.Undefined)
DownRemovalMargin should ===(Duration.Zero)
MinNrOfMembers should ===(1)
MinNrOfMembersOfRole should ===(Map.empty[String, Int])

View file

@ -11,7 +11,8 @@ import com.typesafe.config.{ Config, ConfigFactory }
object ClusterLogSpec {
val config = """
akka.cluster {
auto-down-unreachable-after = 0s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 0s
publish-stats-interval = 0 s # always, when it happens
failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet
}

View file

@ -30,7 +30,8 @@ import scala.concurrent.duration._
object ClusterSpec {
val config = """
akka.cluster {
auto-down-unreachable-after = 0s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 0s
periodic-tasks-initial-delay = 120 seconds // turn off scheduled tasks
publish-stats-interval = 0 s # always, when it happens
failure-detector.implementation-class = akka.cluster.FailureDetectorPuppet

View file

@ -6,14 +6,17 @@ package akka.cluster
import java.util.concurrent.atomic.AtomicBoolean
import scala.concurrent.duration._
import akka.ConfigurationException
import akka.actor.{ ActorSystem, Props }
import akka.testkit.TestKit.{ awaitCond, shutdownActorSystem }
import akka.actor.ActorSystem
import akka.actor.Props
import akka.testkit.TestKit.awaitCond
import akka.testkit.TestKit.shutdownActorSystem
import akka.util.unused
import com.typesafe.config.ConfigFactory
import org.scalatest.{ Matchers, WordSpec }
import scala.concurrent.duration._
import org.scalatest.Matchers
import org.scalatest.WordSpec
class FailingDowningProvider(@unused system: ActorSystem) extends DowningProvider {
override val downRemovalMargin: FiniteDuration = 20.seconds
@ -39,6 +42,10 @@ class DowningProviderSpec extends WordSpec with Matchers {
loglevel = WARNING
actor.provider = "cluster"
remote {
artery.canonical {
hostname = 127.0.0.1
port = 0
}
classic.netty.tcp {
hostname = "127.0.0.1"
port = 0
@ -55,16 +62,6 @@ class DowningProviderSpec extends WordSpec with Matchers {
shutdownActorSystem(system)
}
"use akka.cluster.AutoDowning if 'auto-down-unreachable-after' is configured" in {
val system = ActorSystem(
"auto-downing",
ConfigFactory.parseString("""
akka.cluster.auto-down-unreachable-after = 18d
""").withFallback(baseConf))
Cluster(system).downingProvider shouldBe an[AutoDowning]
shutdownActorSystem(system)
}
"use the specified downing provider" in {
val system = ActorSystem(
"auto-downing",

View file

@ -259,7 +259,8 @@ class JoinConfigCompatCheckerSpec extends AkkaSpec with ClusterTestKit {
akka.cluster {
# using explicit downing provider class
downing-provider-class = "akka.cluster.AutoDowning"
downing-provider-class = "akka.cluster.testkit.AutoDowning"
testkit.auto-down-unreachable-after = 0s
configuration-compatibility-check {
enforce-on-join = on

View file

@ -2,17 +2,82 @@
* Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster
import akka.ConfigurationException
import akka.actor.{ Actor, ActorSystem, Address, Cancellable, Props, Scheduler }
import scala.concurrent.duration.FiniteDuration
import akka.cluster.ClusterEvent._
package akka.cluster.testkit
import scala.concurrent.duration.Duration
import scala.concurrent.duration.FiniteDuration
import akka.actor.Actor
import akka.actor.ActorLogging
import com.github.ghik.silencer.silent
import akka.actor.ActorSystem
import akka.actor.Address
import akka.actor.Cancellable
import akka.actor.Props
import akka.actor.Scheduler
import akka.cluster.Cluster
import akka.cluster.ClusterEvent._
import akka.cluster.DowningProvider
import akka.cluster.Member
import akka.cluster.MembershipState
import akka.cluster.UniqueAddress
import akka.util.Helpers.ConfigOps
import akka.util.Helpers.Requiring
import akka.util.Helpers.toRootLowerCase
/**
* Downing provider used for testing.
*
* Auto-downing is a naïve approach to remove unreachable nodes from the cluster membership.
* In a production environment it will eventually break down the cluster.
* When a network partition occurs, both sides of the partition will see the other side as unreachable
* and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
* (known as *Split Brain*).
*
* This behavior is not limited to network partitions. It can also occur if a node in the cluster is
* overloaded, or experiences a long GC pause.
*
* When using Cluster Singleton or Cluster Sharding it can break the contract provided by those features.
* Both provide a guarantee that an actor will be unique in a cluster.
* With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
* When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior
* in the system.
*
* This is even more severe when Akka Persistence is used in conjunction with Cluster Sharding.
* In this case, the lack of unique actors can cause multiple actors to write to the same journal.
* Akka Persistence operates on a single writer principle. Having multiple writers will corrupt
* the journal and make it unusable.
*
* Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
* system to form multiple small clusters. These small clusters will be independent from each other. They will be
* unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
* it will require manual intervention in order to reform the cluster.
*
* Because of these issues, auto-downing should never be used in a production environment.
*/
final class AutoDowning(system: ActorSystem) extends DowningProvider {
private def clusterSettings = Cluster(system).settings
private val AutoDownUnreachableAfter: Duration = {
val key = "akka.cluster.testkit.auto-down-unreachable-after"
// it's not in reference.conf, since only used in tests
if (clusterSettings.config.hasPath(key)) {
toRootLowerCase(clusterSettings.config.getString(key)) match {
case "off" => Duration.Undefined
case _ => clusterSettings.config.getMillisDuration(key).requiring(_ >= Duration.Zero, key + " >= 0s, or off")
}
} else
Duration.Undefined
}
override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
override def downingActorProps: Option[Props] =
AutoDownUnreachableAfter match {
case d: FiniteDuration => Some(AutoDown.props(d))
case _ => None // auto-down-unreachable-after = off
}
}
/**
* INTERNAL API
@ -25,26 +90,6 @@ private[cluster] object AutoDown {
final case class UnreachableTimeout(node: UniqueAddress)
}
/**
* Used when no custom provider is configured and 'auto-down-unreachable-after' is enabled.
*/
final class AutoDowning(system: ActorSystem) extends DowningProvider {
private def clusterSettings = Cluster(system).settings
@silent("deprecated")
override def downRemovalMargin: FiniteDuration = clusterSettings.DownRemovalMargin
override def downingActorProps: Option[Props] =
clusterSettings.AutoDownUnreachableAfter match {
case d: FiniteDuration => Some(AutoDown.props(d))
case _ =>
// I don't think this can actually happen
throw new ConfigurationException(
"AutoDowning downing provider selected but 'akka.cluster.auto-down-unreachable-after' not set")
}
}
/**
* INTERNAL API
*
@ -68,9 +113,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)
// re-subscribe when restart
override def preStart(): Unit = {
log.warning(
"Don't use auto-down feature of Akka Cluster in production. " +
"See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.")
log.debug("Auto-down is enabled in test.")
cluster.subscribe(self, classOf[ClusterDomainEvent])
super.preStart()
}
@ -81,11 +124,7 @@ private[cluster] class AutoDown(autoDownUnreachableAfter: FiniteDuration)
override def down(node: Address): Unit = {
require(leader)
logInfo(
"Leader is auto-downing unreachable node [{}]. " +
"Don't use auto-down feature of Akka Cluster in production. " +
"See 'Auto-downing (DO NOT USE)' section of Akka Cluster documentation.",
node)
logInfo("Leader is auto-downing unreachable node [{}].", node)
cluster.down(node)
}

View file

@ -2,15 +2,18 @@
* Copyright (C) 2009-2019 Lightbend Inc. <https://www.lightbend.com>
*/
package akka.cluster
package akka.cluster.testkit
import scala.concurrent.duration._
import akka.actor.Address
import akka.actor.Scheduler
import akka.actor.ActorRef
import akka.actor.Address
import akka.actor.Props
import akka.cluster.MemberStatus._
import akka.actor.Scheduler
import akka.cluster.ClusterEvent._
import akka.cluster.Member
import akka.cluster.MemberStatus._
import akka.cluster.TestMember
import akka.remote.RARP
import akka.testkit.AkkaSpec
import akka.testkit.TimingTest

View file

@ -32,7 +32,8 @@ object LotsOfDataBot {
// Override the configuration of the port
val config = ConfigFactory
.parseString("akka.remote.classic.netty.tcp.port=" + port)
.withFallback(ConfigFactory.load(ConfigFactory.parseString("""
.withFallback(
ConfigFactory.load(ConfigFactory.parseString("""
passive = off
max-entries = 100000
akka.actor.provider = "cluster"
@ -48,7 +49,8 @@ object LotsOfDataBot {
"akka://ClusterSystem@127.0.0.1:2551",
"akka://ClusterSystem@127.0.0.1:2552"]
auto-down-unreachable-after = 10s
downing-provider-class = akka.cluster.testkit.AutoDowning
testkit.auto-down-unreachable-after = 10s
}
""")))

View file

@ -227,14 +227,6 @@ graceful leaving process of a cluster member.
See @ref:[removal of Internal Cluster Sharding Data](typed/cluster-sharding.md#removal-of-internal-cluster-sharding-data) in the documentation of the new APIs.
## Configuration
`ClusterShardingSettings` is a parameter to the `start` method of
the `ClusterSharding` extension, i.e. each each entity type can be configured with different settings
if needed.
See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.
## Inspecting cluster sharding state
Two requests to inspect the cluster state are available:
@ -256,20 +248,13 @@ directly sending messages to the individual entities.
## Lease
A @ref[lease](coordination.md) can be used as an additional safety measure to ensure a shard
does not run on two nodes.
A lease can be used as an additional safety measure to ensure a shard does not run on two nodes.
See @ref:[Lease](typed/cluster-sharding.md#lease) in the documentation of the new APIs.
Reasons for how this can happen:
## Configuration
* Network partitions without an appropriate downing provider
* Mistakes in the deployment process leading to two separate Akka Clusters
* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
`ClusterShardingSettings` is a parameter to the `start` method of
the `ClusterSharding` extension, i.e. each entity type can be configured with different settings
if needed.
A lease can be a final backup that means that each shard won't create child entity actors unless it has the lease.
To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
of the lease to use. Each shard will try and acquire a lease with with the name `<actor system name>-shard-<type name>-<shard id>` and
the owner is set to the `Cluster(system).selfAddress.hostPort`.
If a shard can't acquire a lease it will remain uninitialized so messages for entities it owns will
be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
See @ref:[configuration](typed/cluster-sharding.md#configuration) for more information.

View file

@ -104,6 +104,14 @@ Scala
Java
: @@snip [SimpleClusterListener2.java](/akka-docs/src/test/java/jdocs/cluster/SimpleClusterListener2.java) { #join }
## Leaving
See @ref:[Leaving](typed/cluster.md#leaving) in the documentation of the new APIs.
## Downing
See @ref:[Downing](typed/cluster.md#downing) in the documentation of the new APIs.
<a id="cluster-subscriber"></a>
## Subscribe to Cluster Events

View file

@ -3,7 +3,7 @@
## Commercial Support
Commercial support is provided by [Lightbend](http://www.lightbend.com).
Akka is part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
Akka is part of the [Lightbend Platform](http://www.lightbend.com/platform).
## Sponsors

View file

@ -11,6 +11,40 @@ is [no longer available as a static method](https://github.com/scala/bug/issues/
If you are still using Scala 2.11 then you must upgrade to 2.12 or 2.13
## Auto-downing removed
Auto-downing of unreachable Cluster members has been removed after warnings and recommendations against using it
for many years. It was disabled by default, but could be enabled with the configuration
`akka.cluster.auto-down-unreachable-after`.
For alternatives see the @ref:[documentation about Downing](../typed/cluster.md#downing).
Auto-downing was a naïve approach to remove unreachable nodes from the cluster membership.
In a production environment it will eventually break down the cluster.
When a network partition occurs, both sides of the partition will see the other side as unreachable
and remove it from the cluster. This results in the formation of two separate, disconnected, clusters
(known as *Split Brain*).
This behavior is not limited to network partitions. It can also occur if a node in the cluster is
overloaded, or experiences a long GC pause.
When using @ref:[Cluster Singleton](../typed/cluster-singleton.md) or @ref:[Cluster Sharding](../typed/cluster-sharding.md)
it can break the contract provided by those features. Both provide a guarantee that an actor will be unique in a cluster.
With the auto-down feature enabled, it is possible for multiple independent clusters to form (*Split Brain*).
When this happens the guaranteed uniqueness will no longer be true resulting in undesirable behavior in the system.
This is even more severe when @ref:[Akka Persistence](../typed/persistence.md) is used in conjunction with
Cluster Sharding. In this case, the lack of unique actors can cause multiple actors to write to the same journal.
Akka Persistence operates on a single writer principle. Having multiple writers will corrupt the journal
and make it unusable.
Finally, even if you don't use features such as Persistence, Sharding, or Singletons, auto-downing can lead the
system to form multiple small clusters. These small clusters will be independent from each other. They will be
unable to communicate and as a result you may experience performance degradation. Once this condition occurs,
it will require manual intervention in order to reform the cluster.
Because of these issues, auto-downing should **never** be used in a production environment.
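For Akka's own multi-node tests the functionality was moved to `akka.cluster.testkit.AutoDowning`, as the test configuration changes in this commit show. A minimal sketch of that configuration change (the `0s` timeout mirrors the test configs above):

```scala
import com.typesafe.config.ConfigFactory

// Before (removed in 2.6.0):
//   akka.cluster.auto-down-unreachable-after = 0s
// After, for tests only:
val config = ConfigFactory.parseString("""
  akka.cluster.downing-provider-class = akka.cluster.testkit.AutoDowning
  akka.cluster.testkit.auto-down-unreachable-after = 0s
  """)
```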
## Removed features that were deprecated
After being deprecated since 2.5.0, the following have been removed in Akka 2.6.
@ -94,13 +128,25 @@ to make remote interactions look like local method calls.
Warnings about `TypedActor` have been [mentioned in documentation](https://doc.akka.io/docs/akka/2.5/typed-actors.html#when-to-use-typed-actors)
for many years.
### akka-protobuf
`akka-protobuf` was never intended to be used by end users but perhaps this was not well-documented.
Applications should use standard Protobuf dependency instead of `akka-protobuf`. The artifact is still
published, but the transitive dependency to `akka-protobuf` has been removed.
Akka is now using Protobuf version 3.9.0 for serialization of messages defined by Akka.
### Cluster Client
Cluster client has been deprecated as of 2.6 in favor of [Akka gRPC](https://doc.akka.io/docs/akka-grpc/current/index.html).
It is not advised to build new applications with Cluster client, and existing users @ref[should migrate to Akka gRPC](../cluster-client.md#migration-to-akka-grpc).
### akka.Main
`akka.Main` is deprecated in favour of starting the `ActorSystem` from a custom main class instead. `akka.Main` was not
adding much value and typically a custom main class is needed anyway.
@@ Remoting
## Remoting
### Default remoting is now Artery TCP
@ -184,20 +230,7 @@ For TCP:
Classic remoting is deprecated but can be used in `2.6.` Explicitly disable Artery by setting property `akka.remote.artery.enabled` to `false`. Further, any configuration under `akka.remote` that is
specific to classic remoting needs to be moved to `akka.remote.classic`. To see which configuration options
are specific to classic search for them in: [`akka-remote/reference.conf`](/akka-remote/src/main/resources/reference.conf)
### akka-protobuf
`akka-protobuf` was never intended to be used by end users but perhaps this was not well-documented.
Applications should use standard Protobuf dependency instead of `akka-protobuf`. The artifact is still
published, but the transitive dependency to `akka-protobuf` has been removed.
Akka is now using Protobuf version 3.9.0 for serialization of messages defined by Akka.
### Cluster Client
Cluster client has been deprecated as of 2.6 in favor of [Akka gRPC](https://doc.akka.io/docs/akka-grpc/current/index.html).
It is not advised to build new applications with Cluster client, and existing users @ref[should migrate to Akka gRPC](../cluster-client.md#migration-to-akka-grpc).
are specific to classic, search for them in: @ref:[`akka-remote/reference.conf`](../general/configuration.md#config-akka-remote).
## Java Serialization
@ -235,14 +268,12 @@ handling that type and it was previously "accidentally" serialized with Java ser
The following documents configuration changes and behavior changes where no action is required. In some cases the old
behavior can be restored via configuration.
### Remoting
#### Remoting dependencies have been made optional
### Remoting dependencies have been made optional
Classic remoting depends on Netty and Artery UDP depends on Aeron. These are now both optional dependencies that need
to be explicitly added. See @ref[classic remoting](../remoting.md) or @ref[artery remoting](../remoting-artery.md) for instructions.
#### Remote watch and deployment have been disabled without Cluster use
### Remote watch and deployment have been disabled without Cluster use
By default, these remoting features are disabled when not using Akka Cluster:

View file

@ -43,10 +43,10 @@ if that feature is enabled.
@@@ warning
**Don't use Cluster Sharding together with Automatic Downing**,
since it allows the cluster to split up into two separate clusters, which in turn will result
in *multiple shards and entities* being started, one in each separate cluster!
See @ref:[Downing](cluster.md#automatic-vs-manual-downing).
Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple shards and entities*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
@ -304,6 +304,26 @@ rebalanced to other nodes.
See @ref:[How To Startup when Cluster Size Reached](cluster.md#how-to-startup-when-a-cluster-size-is-reached)
for more information about `min-nr-of-members`.
## Lease
A @ref[lease](../coordination.md) can be used as an additional safety measure to ensure a shard
does not run on two nodes.
Reasons for how this can happen:
* Network partitions without an appropriate downing provider
* Mistakes in the deployment process leading to two separate Akka Clusters
* Timing issues between removing members from the Cluster on one side of a network partition and shutting them down on the other side
A lease can be a final backup, meaning that each shard won't create child entity actors unless it has the lease.
To use a lease for sharding set `akka.cluster.sharding.use-lease` to the configuration location
of the lease to use. Each shard will try to acquire a lease with the name `<actor system name>-shard-<type name>-<shard id>` and
the owner is set to the `Cluster(system).selfAddress.hostPort`.
If a shard can't acquire a lease it will remain uninitialized so messages for entities it owns will
be buffered in the `ShardRegion`. If the lease is lost after initialization the Shard will be terminated.
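As a sketch, enabling a lease for sharding could look like the following (the `my-lease` block and the `docs.MyLease` class are hypothetical placeholders for a real lease implementation and its settings):

```scala
import com.typesafe.config.ConfigFactory

// Hypothetical lease configuration for sharding; the keys inside `my-lease`
// depend on the lease implementation that is used.
val leaseConfig = ConfigFactory.parseString("""
  akka.cluster.sharding.use-lease = "my-lease"
  my-lease {
    lease-class = "docs.MyLease"
    heartbeat-interval = 12s
    heartbeat-timeout = 120s
    lease-operation-timeout = 5s
  }
  """)
```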
## Removal of internal Cluster Sharding data
Removal of internal Cluster Sharding data is only relevant for "Persistent Mode".
@ -326,15 +346,6 @@ cannot startup because of corrupt data, which may happen if accidentally
two clusters were running at the same time, e.g. caused by using auto-down
and there was a network partition.
@@@ warning
**Don't use Cluster Sharding together with Automatic Downing**,
since it allows the cluster to split up into two separate clusters, which in turn will result
in *multiple shards and entities* being started, one in each separate cluster!
See @ref:[Downing](cluster.md#automatic-vs-manual-downing).
@@@
Use this program as a standalone Java main program:
```
@ -347,7 +358,7 @@ The program is included in the `akka-cluster-sharding` jar file. It
is easiest to run it with the same classpath and configuration as your ordinary
application. It can be run from sbt or Maven in a similar way.
Specify the entity type names (same as you use in the `start` method
Specify the entity type names (same as you use in the `init` method
of `ClusterSharding`) as program arguments.
If you specify `-2.3` as the first program argument it will also try

View file

@ -32,6 +32,15 @@ such as single-point of bottleneck. Single-point of failure is also a relevant c
but for some cases this feature takes care of that by making sure that another singleton
instance will eventually be started.
@@@ warning
Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
### Singleton manager
The cluster singleton pattern manages one singleton actor instance among all cluster nodes or a group of nodes tagged with
@ -80,23 +89,20 @@ The singleton instance will not run on members with status @ref:[WeaklyUp](clust
This pattern may seem very tempting to use at first, but it has several drawbacks, some of which are listed below:
* the cluster singleton may quickly become a *performance bottleneck*,
* you can not rely on the cluster singleton to be *non-stop* available — e.g. when the node on which the singleton has
been running dies, it will take a few seconds for this to be noticed and the singleton be migrated to another node,
* in the case of a *network partition* appearing in a Cluster that is using Automatic Downing (see docs for
@ref:[Auto Downing](cluster.md#auto-downing-do-not-use),
it may happen that the isolated clusters each decide to spin up their own singleton, meaning that there might be multiple
singletons running in the system, yet the Clusters have no way of finding out about them (because of the partition).
Especially the last point is something you should be aware of — in general when using the Cluster Singleton pattern
you should take care of downing nodes yourself and not rely on the timing based auto-down feature.
* The cluster singleton may quickly become a *performance bottleneck*.
* You cannot rely on the cluster singleton to be *non-stop* available — e.g. when the node on which the singleton
has been running dies, it will take a few seconds for this to be noticed and for the singleton to be migrated to another node.
* If many singletons are used, be aware that they will all run on the oldest node (or the oldest node with the configured role).
@ref:[Cluster Sharding](cluster-sharding.md) combined with keeping the "singleton" entities alive can be a better
alternative.
@@@ warning
**Don't use Cluster Singleton together with Automatic Downing**,
since it allows the cluster to split up into two separate clusters, which in turn will result
in *multiple Singletons* being started, one in each separate cluster!
Make sure to not use a Cluster downing strategy that may split the cluster into several separate clusters in
case of network problems or system overload (long GC pauses), since that will result in *multiple Singletons*
being started, one in each separate cluster!
See @ref:[Downing](cluster.md#downing).
@@@
## Example

View file

@ -255,95 +255,69 @@ after the restart, when it comes up as a new incarnation of an existing member in the
trying to join in, then the existing one will be removed from the cluster and then it will
be allowed to join.
<a id="automatic-vs-manual-downing"></a>
### Downing
When a member is considered by the failure detector to be `unreachable` the
leader is not allowed to perform its duties, such as changing status of
new joining members to 'Up'. The node must first become `reachable` again, or the
status of the unreachable member must be changed to 'Down'. Changing status to 'Down'
can be performed automatically or manually. By default it must be done manually, using
@ref:[JMX](../additional/operations.md#jmx) or @ref:[HTTP](../additional/operations.md#http).
It can also be performed programmatically with @scala[`Cluster(system).down(address)`]@java[`Cluster.get(system).down(address)`].
If a node is still running and sees its self as Down it will shutdown. @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically
run if `run-coordinated-shutdown-when-down` is set to `on` (the default) however the node will not try
and leave the cluster gracefully so sharding and singleton migration will not occur.
A production solution for the downing problem is provided by
[Split Brain Resolver](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html),
which is part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
If you dont use RP, you should anyway carefully read the [documentation](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html)
of the Split Brain Resolver and make sure that the solution you are using handles the concerns
described there.
### Auto-downing - DO NOT USE
There is an automatic downing feature that you should not use in production. For testing you can enable it with configuration:
```
akka.cluster.auto-down-unreachable-after = 120s
```
This means that the cluster leader member will change the `unreachable` node
status to `down` automatically after the configured time of unreachability.
This is a naïve approach to remove unreachable nodes from the cluster membership.
It can be useful during development but in a production environment it will eventually breakdown the cluster.
When a network partition occurs, both sides of the partition will see the other side as unreachable and remove it from the cluster.
This results in the formation of two separate, disconnected, clusters (known as *Split Brain*).
This behaviour is not limited to network partitions. It can also occur if a node
in the cluster is overloaded, or experiences a long GC pause.
@@@ warning
We recommend against using the auto-down feature of Akka Cluster in production. It
has multiple undesirable consequences for production systems.
If you are using @ref:[Cluster Singleton](cluster-singleton.md) or @ref:[Cluster Sharding](cluster-sharding.md) it can break the contract provided by
those features. Both provide a guarantee that an actor will be unique in a cluster.
With the auto-down feature enabled, it is possible for multiple independent clusters
to form (*Split Brain*). When this happens the guaranteed uniqueness will no
longer be true resulting in undesirable behaviour in the system.
This is even more severe when @ref:[Akka Persistence](persistence.md) is used in
conjunction with Cluster Sharding. In this case, the lack of unique actors can
cause multiple actors to write to the same journal. Akka Persistence operates on a
single writer principle. Having multiple writers will corrupt the journal
and make it unusable.
Finally, even if you don't use features such as Persistence, Sharding, or Singletons,
auto-downing can lead the system to form multiple small clusters. These small
clusters will be independent from each other. They will be unable to communicate
and as a result you may experience performance degradation. Once this condition
occurs, it will require manual intervention in order to reform the cluster.
Because of these issues, auto-downing should **never** be used in a production environment.
@@@
### Leaving
There are two ways to remove a member from the cluster.
There are a few ways to remove a member from the cluster.
1. The recommended way to leave a cluster is a graceful exit, informing the cluster that a node shall leave.
This can be performed using @ref:[JMX](../additional/operations.md#jmx) or @ref:[HTTP](../additional/operations.md#http).
This method will offer faster hand off to peer nodes during node shutdown.
1. When a graceful exit is not possible, you can stop the actor system (or the JVM process, for example a SIGTERM sent from the environment). It will be detected
as unreachable and removed after the automatic or manual downing.
1. The recommended way to leave a cluster is a graceful exit, informing the cluster that a node shall leave.
This is performed by @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) when the `ActorSystem`
is terminated and also when a SIGTERM is sent from the environment to stop the JVM process.
1. Graceful exit can also be performed using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).
1. When a graceful exit is not possible, for example in case of abrupt termination of the JVM process, the node
will be detected as unreachable by other nodes and removed after @ref:[Downing](#downing).
The @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically run when the cluster node sees itself as
Graceful leaving will offer faster hand off to peer nodes during node shutdown than abrupt termination and downing.
The @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will also run when the cluster node sees itself as
`Exiting`, i.e. leaving from another node will trigger the shutdown process on the leaving node.
Tasks for graceful leaving of the cluster, including graceful shutdown of Cluster Singletons and
Cluster Sharding are added automatically when Akka Cluster is used, i.e. running the shutdown
process will also trigger the graceful leaving if it's not already in progress.
Normally this is handled automatically, but in case of network failures during this process it might still
be necessary to set the nodes status to `Down` in order to complete the removal. For handling network failures
see [Split Brain Resolver](http://developer.lightbend.com/docs/akka-commercial-addons/current/split-brain-resolver.html),
part of the [Lightbend Reactive Platform](http://www.lightbend.com/platform).
be necessary to set the node's status to `Down` in order to complete the removal, see @ref:[Downing](#downing).
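A graceful leave can also be initiated programmatically through the `Cluster` extension. A minimal sketch with the typed API (assuming a typed `ActorSystem` is at hand):

```scala
import akka.actor.typed.ActorSystem
import akka.cluster.typed.{ Cluster, Leave }

// Ask this node to leave the cluster gracefully; Coordinated Shutdown runs
// when the member sees itself as Exiting.
def leaveSelf(system: ActorSystem[_]): Unit = {
  val cluster = Cluster(system)
  cluster.manager ! Leave(cluster.selfMember.address)
}
```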
### Downing
In many cases a member can gracefully exit from the cluster, as described in @ref:[Leaving](#leaving), but
there are scenarios when an explicit downing decision is needed before it can be removed. For example in case
of abrupt termination of the JVM process, system overload that doesn't recover, or network partitions
that don't heal. In such cases the node(s) will be detected as unreachable by other nodes, but they must also
be marked as `Down` before they are removed.
When a member is considered by the failure detector to be `unreachable` the
leader is not allowed to perform its duties, such as changing status of
new joining members to 'Up'. The node must first become `reachable` again, or the
status of the unreachable member must be changed to `Down`. Changing status to `Down`
can be performed automatically or manually.
By default, downing must be performed manually using @ref:[HTTP](../additional/operations.md#http) or @ref:[JMX](../additional/operations.md#jmx).
Note that @ref:[Cluster Singleton](cluster-singleton.md) or @ref:[Cluster Sharding entities](cluster-sharding.md) that
are running on a crashed (unreachable) node will not be started on another node until the previous node has
been removed from the Cluster. Removal of crashed (unreachable) nodes is performed after a downing decision.
A production solution for downing is provided by
[Split Brain Resolver](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html),
which is part of the [Lightbend Platform](http://www.lightbend.com/platform).
If you don't have a Lightbend Platform Subscription, you should still carefully read the
[documentation](https://doc.akka.io/docs/akka-enhancements/current/split-brain-resolver.html)
of the Split Brain Resolver and make sure that the solution you are using handles the concerns and scenarios
described there.
A custom downing strategy can be implemented with a @apidoc[akka.cluster.DowningProvider] and enabled with
configuration `akka.cluster.downing-provider-class`.
Downing can also be performed programmatically with @scala[`Cluster(system).manager ! Down(address)`]@java[`Cluster.get(system).manager().tell(Down(address))`],
but that is mostly useful from tests and when implementing a `DowningProvider`.
If a crashed node is restarted with the same hostname and port and joins the cluster again, the previous incarnation
of that member will be downed and removed. The new join attempt with the same hostname and port is used as evidence
that the previous incarnation is not alive anymore.
If a node is still running and sees itself as `Down` it will shut down. @ref:[Coordinated Shutdown](../actors.md#coordinated-shutdown) will automatically
run if `run-coordinated-shutdown-when-down` is set to `on` (the default), however the node will not try
to leave the cluster gracefully.
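As mentioned, downing can also be performed programmatically. A minimal sketch with the typed API (the address would typically come from an `UnreachableMember` event or operational tooling):

```scala
import akka.actor.Address
import akka.actor.typed.ActorSystem
import akka.cluster.typed.{ Cluster, Down }

// Mark a node as Down so the leader can remove it from the membership.
def downNode(system: ActorSystem[_], node: Address): Unit =
  Cluster(system).manager ! Down(node)
```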
## Node Roles