Merge pull request #722 from akka/wip-2251-doc-cluster-fd-patriknw
Describe cluster failure detector and phi, see #2251
This commit is contained in:
commit
4ff414bd3f
7 changed files with 65 additions and 14 deletions
|
|
@ -36,9 +36,6 @@ akka {
|
||||||
# how often should the node send out gossip information?
|
# how often should the node send out gossip information?
|
||||||
gossip-interval = 1s
|
gossip-interval = 1s
|
||||||
|
|
||||||
# how often should the node send out heartbeats?
|
|
||||||
heartbeat-interval = 1s
|
|
||||||
|
|
||||||
# how often should the leader perform maintenance tasks?
|
# how often should the leader perform maintenance tasks?
|
||||||
leader-actions-interval = 1s
|
leader-actions-interval = 1s
|
||||||
|
|
||||||
|
|
@ -76,6 +73,9 @@ akka {
|
||||||
# akka.cluster.ClusterSettings parameters
|
# akka.cluster.ClusterSettings parameters
|
||||||
implementation-class = "akka.cluster.AccrualFailureDetector"
|
implementation-class = "akka.cluster.AccrualFailureDetector"
|
||||||
|
|
||||||
|
# how often should the node send out heartbeats?
|
||||||
|
heartbeat-interval = 1s
|
||||||
|
|
||||||
# defines the failure detector threshold
|
# defines the failure detector threshold
|
||||||
# A low threshold is prone to generate many wrong suspicions but ensures
|
# A low threshold is prone to generate many wrong suspicions but ensures
|
||||||
# a quick detection in the event of a real crash. Conversely, a high
|
# a quick detection in the event of a real crash. Conversely, a high
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ class ClusterSettings(val config: Config, val systemName: String) {
|
||||||
Duration(getMilliseconds("akka.cluster.failure-detector.min-std-deviation"), MILLISECONDS)
|
Duration(getMilliseconds("akka.cluster.failure-detector.min-std-deviation"), MILLISECONDS)
|
||||||
final val FailureDetectorAcceptableHeartbeatPause: FiniteDuration =
|
final val FailureDetectorAcceptableHeartbeatPause: FiniteDuration =
|
||||||
Duration(getMilliseconds("akka.cluster.failure-detector.acceptable-heartbeat-pause"), MILLISECONDS)
|
Duration(getMilliseconds("akka.cluster.failure-detector.acceptable-heartbeat-pause"), MILLISECONDS)
|
||||||
|
final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.failure-detector.heartbeat-interval"), MILLISECONDS)
|
||||||
|
|
||||||
final val SeedNodes: IndexedSeq[Address] = getStringList("akka.cluster.seed-nodes").asScala.map {
|
final val SeedNodes: IndexedSeq[Address] = getStringList("akka.cluster.seed-nodes").asScala.map {
|
||||||
case AddressFromURIString(addr) ⇒ addr
|
case AddressFromURIString(addr) ⇒ addr
|
||||||
|
|
@ -30,7 +31,6 @@ class ClusterSettings(val config: Config, val systemName: String) {
|
||||||
final val SeedNodeTimeout: FiniteDuration = Duration(getMilliseconds("akka.cluster.seed-node-timeout"), MILLISECONDS)
|
final val SeedNodeTimeout: FiniteDuration = Duration(getMilliseconds("akka.cluster.seed-node-timeout"), MILLISECONDS)
|
||||||
final val PeriodicTasksInitialDelay: FiniteDuration = Duration(getMilliseconds("akka.cluster.periodic-tasks-initial-delay"), MILLISECONDS)
|
final val PeriodicTasksInitialDelay: FiniteDuration = Duration(getMilliseconds("akka.cluster.periodic-tasks-initial-delay"), MILLISECONDS)
|
||||||
final val GossipInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.gossip-interval"), MILLISECONDS)
|
final val GossipInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.gossip-interval"), MILLISECONDS)
|
||||||
final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.heartbeat-interval"), MILLISECONDS)
|
|
||||||
final val LeaderActionsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS)
|
final val LeaderActionsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS)
|
||||||
final val UnreachableNodesReaperInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS)
|
final val UnreachableNodesReaperInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS)
|
||||||
final val PublishStatsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.publish-stats-interval"), MILLISECONDS)
|
final val PublishStatsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.publish-stats-interval"), MILLISECONDS)
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ import com.typesafe.config.Config
|
||||||
import com.typesafe.config.ConfigFactory
|
import com.typesafe.config.ConfigFactory
|
||||||
import akka.actor.{ Address, ExtendedActorSystem }
|
import akka.actor.{ Address, ExtendedActorSystem }
|
||||||
import akka.remote.testconductor.RoleName
|
import akka.remote.testconductor.RoleName
|
||||||
import akka.remote.testkit.{STMultiNodeSpec, MultiNodeSpec}
|
import akka.remote.testkit.{ STMultiNodeSpec, MultiNodeSpec }
|
||||||
import akka.testkit._
|
import akka.testkit._
|
||||||
import scala.concurrent.util.duration._
|
import scala.concurrent.util.duration._
|
||||||
import scala.concurrent.util.Duration
|
import scala.concurrent.util.Duration
|
||||||
|
|
@ -35,11 +35,11 @@ object MultiNodeClusterSpec {
|
||||||
auto-down = off
|
auto-down = off
|
||||||
jmx.enabled = off
|
jmx.enabled = off
|
||||||
gossip-interval = 200 ms
|
gossip-interval = 200 ms
|
||||||
heartbeat-interval = 400 ms
|
|
||||||
leader-actions-interval = 200 ms
|
leader-actions-interval = 200 ms
|
||||||
unreachable-nodes-reaper-interval = 200 ms
|
unreachable-nodes-reaper-interval = 200 ms
|
||||||
periodic-tasks-initial-delay = 300 ms
|
periodic-tasks-initial-delay = 300 ms
|
||||||
publish-stats-interval = 0 s # always, when it happens
|
publish-stats-interval = 0 s # always, when it happens
|
||||||
|
failure-detector.heartbeat-interval = 400 ms
|
||||||
}
|
}
|
||||||
akka.remote.log-remote-lifecycle-events = off
|
akka.remote.log-remote-lifecycle-events = off
|
||||||
akka.test {
|
akka.test {
|
||||||
|
|
|
||||||
|
|
@ -214,6 +214,57 @@ frontend nodes and 3 backend nodes::
|
||||||
|
|
||||||
.. note:: The above example should probably be designed as two separate, frontend/backend, clusters, when there is a `cluster client for decoupling clusters <https://www.assembla.com/spaces/akka/tickets/1165>`_.
|
.. note:: The above example should probably be designed as two separate, frontend/backend, clusters, when there is a `cluster client for decoupling clusters <https://www.assembla.com/spaces/akka/tickets/1165>`_.
|
||||||
|
|
||||||
|
Failure Detector
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The nodes in the cluster monitor each other by sending heartbeats to detect if a node is
|
||||||
|
unreachable from the rest of the cluster. The heartbeat arrival times are interpreted
|
||||||
|
by an implementation of
|
||||||
|
`The Phi Accrual Failure Detector <http://ddg.jaist.ac.jp/pub/HDY+04.pdf>`_.
|
||||||
|
|
||||||
|
The suspicion level of failure is given by a value called *phi*.
|
||||||
|
The basic idea of the phi failure detector is to express the value of *phi* on a scale that
|
||||||
|
is dynamically adjusted to reflect current network conditions.
|
||||||
|
|
||||||
|
The value of *phi* is calculated as::
|
||||||
|
|
||||||
|
phi = -log10(1 - F(timeSinceLastHeartbeat))
|
||||||
|
|
||||||
|
where F is the cumulative distribution function of a normal distribution with mean
|
||||||
|
and standard deviation estimated from historical heartbeat inter-arrival times.
|
||||||
|
|
||||||
|
In the :ref:`cluster_configuration` you can adjust the ``akka.cluster.failure-detector.threshold``
|
||||||
|
to define when a *phi* value is considered to be a failure.
|
||||||
|
|
||||||
|
A low ``threshold`` is prone to generate many false positives but ensures
|
||||||
|
a quick detection in the event of a real crash. Conversely, a high ``threshold``
|
||||||
|
generates fewer mistakes but needs more time to detect actual crashes. The
|
||||||
|
default ``threshold`` is 8 and is appropriate for most situations. However, in
|
||||||
|
cloud environments, such as Amazon EC2, the value could be increased to 12 in
|
||||||
|
order to account for network issues that sometimes occur on such platforms.
|
||||||
|
|
||||||
|
The following chart illustrates how *phi* increases with increasing time since the
|
||||||
|
previous heartbeat.
|
||||||
|
|
||||||
|
.. image:: images/phi1.png
|
||||||
|
|
||||||
|
Phi is calculated from the mean and standard deviation of historical
|
||||||
|
inter-arrival times. The previous chart is an example for a standard deviation
|
||||||
|
of 200 ms. If the heartbeats arrive with less deviation the curve becomes steeper,
|
||||||
|
i.e. it's possible to determine failure more quickly. The curve looks like this for
|
||||||
|
a standard deviation of 100 ms.
|
||||||
|
|
||||||
|
.. image:: images/phi2.png
|
||||||
|
|
||||||
|
To be able to survive sudden abnormalities, such as garbage collection pauses and
|
||||||
|
transient network failures, the failure detector is configured with a margin,
|
||||||
|
``akka.cluster.failure-detector.acceptable-heartbeat-pause``. You may want to
|
||||||
|
adjust the :ref:`cluster_configuration` of this depending on your environment.
|
||||||
|
This is what the curve looks like for ``acceptable-heartbeat-pause`` configured to
|
||||||
|
3 seconds.
|
||||||
|
|
||||||
|
.. image:: images/phi3.png
|
||||||
|
|
||||||
Cluster Aware Routers
|
Cluster Aware Routers
|
||||||
^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
|
|
||||||
BIN
akka-docs/rst/cluster/images/phi1.png
Normal file
BIN
akka-docs/rst/cluster/images/phi1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
BIN
akka-docs/rst/cluster/images/phi2.png
Normal file
BIN
akka-docs/rst/cluster/images/phi2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
akka-docs/rst/cluster/images/phi3.png
Normal file
BIN
akka-docs/rst/cluster/images/phi3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
Loading…
Add table
Add a link
Reference in a new issue