diff --git a/akka-cluster/src/main/resources/reference.conf b/akka-cluster/src/main/resources/reference.conf index fa6860a1a8..e2fe6cc8d1 100644 --- a/akka-cluster/src/main/resources/reference.conf +++ b/akka-cluster/src/main/resources/reference.conf @@ -36,9 +36,6 @@ akka { # how often should the node send out gossip information? gossip-interval = 1s - # how often should the node send out heartbeats? - heartbeat-interval = 1s - # how often should the leader perform maintenance tasks? leader-actions-interval = 1s @@ -76,6 +73,9 @@ akka { # akka.cluster.ClusterSettings parameters implementation-class = "akka.cluster.AccrualFailureDetector" + # how often should the node send out heartbeats? + heartbeat-interval = 1s + # defines the failure detector threshold # A low threshold is prone to generate many wrong suspicions but ensures # a quick detection in the event of a real crash. Conversely, a high diff --git a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala index 4212e59c1c..3c7baa4f76 100644 --- a/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala +++ b/akka-cluster/src/main/scala/akka/cluster/ClusterSettings.scala @@ -23,6 +23,7 @@ class ClusterSettings(val config: Config, val systemName: String) { Duration(getMilliseconds("akka.cluster.failure-detector.min-std-deviation"), MILLISECONDS) final val FailureDetectorAcceptableHeartbeatPause: FiniteDuration = Duration(getMilliseconds("akka.cluster.failure-detector.acceptable-heartbeat-pause"), MILLISECONDS) + final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.failure-detector.heartbeat-interval"), MILLISECONDS) final val SeedNodes: IndexedSeq[Address] = getStringList("akka.cluster.seed-nodes").asScala.map { case AddressFromURIString(addr) ⇒ addr @@ -30,7 +31,6 @@ class ClusterSettings(val config: Config, val systemName: String) { final val SeedNodeTimeout: FiniteDuration = 
Duration(getMilliseconds("akka.cluster.seed-node-timeout"), MILLISECONDS) final val PeriodicTasksInitialDelay: FiniteDuration = Duration(getMilliseconds("akka.cluster.periodic-tasks-initial-delay"), MILLISECONDS) final val GossipInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.gossip-interval"), MILLISECONDS) - final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.heartbeat-interval"), MILLISECONDS) final val LeaderActionsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS) final val UnreachableNodesReaperInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS) final val PublishStatsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.publish-stats-interval"), MILLISECONDS) diff --git a/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala b/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala index f38d80ace5..68ed7d91e7 100644 --- a/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala +++ b/akka-cluster/src/multi-jvm/scala/akka/cluster/MultiNodeClusterSpec.scala @@ -8,7 +8,7 @@ import com.typesafe.config.Config import com.typesafe.config.ConfigFactory import akka.actor.{ Address, ExtendedActorSystem } import akka.remote.testconductor.RoleName -import akka.remote.testkit.{STMultiNodeSpec, MultiNodeSpec} +import akka.remote.testkit.{ STMultiNodeSpec, MultiNodeSpec } import akka.testkit._ import scala.concurrent.util.duration._ import scala.concurrent.util.Duration @@ -31,15 +31,15 @@ object MultiNodeClusterSpec { def clusterConfig: Config = ConfigFactory.parseString(""" akka.actor.provider = akka.cluster.ClusterActorRefProvider akka.cluster { - auto-join = on - auto-down = off - jmx.enabled = off - gossip-interval = 200 ms - heartbeat-interval = 400 ms - leader-actions-interval = 200 ms - unreachable-nodes-reaper-interval = 200 ms - 
periodic-tasks-initial-delay = 300 ms - publish-stats-interval = 0 s # always, when it happens + auto-join = on + auto-down = off + jmx.enabled = off + gossip-interval = 200 ms + leader-actions-interval = 200 ms + unreachable-nodes-reaper-interval = 200 ms + periodic-tasks-initial-delay = 300 ms + publish-stats-interval = 0 s # always, when it happens + failure-detector.heartbeat-interval = 400 ms } akka.remote.log-remote-lifecycle-events = off akka.test { diff --git a/akka-docs/rst/cluster/cluster-usage.rst b/akka-docs/rst/cluster/cluster-usage.rst index 3c8b112bef..12be1c238c 100644 --- a/akka-docs/rst/cluster/cluster-usage.rst +++ b/akka-docs/rst/cluster/cluster-usage.rst @@ -214,6 +214,57 @@ frontend nodes and 3 backend nodes:: .. note:: The above example should probably be designed as two separate, frontend/backend, clusters, when there is a `cluster client for decoupling clusters `_. +Failure Detector +^^^^^^^^^^^^^^^^ + +The nodes in the cluster monitor each other by sending heartbeats to detect if a node is +unreachable from the rest of the cluster. The heartbeat arrival times are interpreted +by an implementation of +`The Phi Accrual Failure Detector `_. + +The suspicion level of failure is given by a value called *phi*. +The basic idea of the phi failure detector is to express the value of *phi* on a scale that +is dynamically adjusted to reflect current network conditions. + +The value of *phi* is calculated as:: + + phi = -log10(1 - F(timeSinceLastHeartbeat)) + +where F is the cumulative distribution function of a normal distribution with mean +and standard deviation estimated from historical heartbeat inter-arrival times. + +In the :ref:`cluster_configuration` you can adjust the ``akka.cluster.failure-detector.threshold`` +to define when a *phi* value is considered to be a failure. + +A low ``threshold`` is prone to generate many false positives but ensures +a quick detection in the event of a real crash. 
Conversely, a high ``threshold`` +generates fewer mistakes but needs more time to detect actual crashes. The +default ``threshold`` is 8 and is appropriate for most situations. However, in +cloud environments, such as Amazon EC2, the value could be increased to 12 in +order to account for network issues that sometimes occur on such platforms. + +The following chart illustrates how *phi* increases with increasing time since the +previous heartbeat. + +.. image:: images/phi1.png + +Phi is calculated from the mean and standard deviation of historical +inter-arrival times. The previous chart is an example for a standard deviation +of 200 ms. If the heartbeats arrive with less deviation, the curve becomes steeper, +i.e. it's possible to determine failure more quickly. The curve looks like this for +a standard deviation of 100 ms. + +.. image:: images/phi2.png + +To be able to survive sudden abnormalities, such as garbage collection pauses and +transient network failures, the failure detector is configured with a margin, +``akka.cluster.failure-detector.acceptable-heartbeat-pause``. You may want to +adjust the :ref:`cluster_configuration` of this depending on your environment. +This is what the curve looks like for ``acceptable-heartbeat-pause`` configured to +3 seconds. + +.. image:: images/phi3.png + Cluster Aware Routers ^^^^^^^^^^^^^^^^^^^^^ diff --git a/akka-docs/rst/cluster/images/phi1.png b/akka-docs/rst/cluster/images/phi1.png new file mode 100644 index 0000000000..104068ec54 Binary files /dev/null and b/akka-docs/rst/cluster/images/phi1.png differ diff --git a/akka-docs/rst/cluster/images/phi2.png b/akka-docs/rst/cluster/images/phi2.png new file mode 100644 index 0000000000..af2e756991 Binary files /dev/null and b/akka-docs/rst/cluster/images/phi2.png differ diff --git a/akka-docs/rst/cluster/images/phi3.png b/akka-docs/rst/cluster/images/phi3.png new file mode 100644 index 0000000000..bda3c5d345 Binary files /dev/null and b/akka-docs/rst/cluster/images/phi3.png differ