Merge pull request #722 from akka/wip-2251-doc-cluster-fd-patriknw
Describe cluster failure detector and phi, see #2251
This commit is contained in:
commit
4ff414bd3f
7 changed files with 65 additions and 14 deletions
|
|
@ -36,9 +36,6 @@ akka {
|
|||
# how often should the node send out gossip information?
|
||||
gossip-interval = 1s
|
||||
|
||||
# how often should the node send out heartbeats?
|
||||
heartbeat-interval = 1s
|
||||
|
||||
# how often should the leader perform maintenance tasks?
|
||||
leader-actions-interval = 1s
|
||||
|
||||
|
|
@ -76,6 +73,9 @@ akka {
|
|||
# akka.cluster.ClusterSettings parameters
|
||||
implementation-class = "akka.cluster.AccrualFailureDetector"
|
||||
|
||||
# how often should the node send out heartbeats?
|
||||
heartbeat-interval = 1s
|
||||
|
||||
# defines the failure detector threshold
|
||||
# A low threshold is prone to generate many wrong suspicions but ensures
|
||||
# a quick detection in the event of a real crash. Conversely, a high
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ class ClusterSettings(val config: Config, val systemName: String) {
|
|||
Duration(getMilliseconds("akka.cluster.failure-detector.min-std-deviation"), MILLISECONDS)
|
||||
final val FailureDetectorAcceptableHeartbeatPause: FiniteDuration =
|
||||
Duration(getMilliseconds("akka.cluster.failure-detector.acceptable-heartbeat-pause"), MILLISECONDS)
|
||||
final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.failure-detector.heartbeat-interval"), MILLISECONDS)
|
||||
|
||||
final val SeedNodes: IndexedSeq[Address] = getStringList("akka.cluster.seed-nodes").asScala.map {
|
||||
case AddressFromURIString(addr) ⇒ addr
|
||||
|
|
@ -30,7 +31,6 @@ class ClusterSettings(val config: Config, val systemName: String) {
|
|||
final val SeedNodeTimeout: FiniteDuration = Duration(getMilliseconds("akka.cluster.seed-node-timeout"), MILLISECONDS)
|
||||
final val PeriodicTasksInitialDelay: FiniteDuration = Duration(getMilliseconds("akka.cluster.periodic-tasks-initial-delay"), MILLISECONDS)
|
||||
final val GossipInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.gossip-interval"), MILLISECONDS)
|
||||
final val HeartbeatInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.heartbeat-interval"), MILLISECONDS)
|
||||
final val LeaderActionsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.leader-actions-interval"), MILLISECONDS)
|
||||
final val UnreachableNodesReaperInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.unreachable-nodes-reaper-interval"), MILLISECONDS)
|
||||
final val PublishStatsInterval: FiniteDuration = Duration(getMilliseconds("akka.cluster.publish-stats-interval"), MILLISECONDS)
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ import com.typesafe.config.Config
|
|||
import com.typesafe.config.ConfigFactory
|
||||
import akka.actor.{ Address, ExtendedActorSystem }
|
||||
import akka.remote.testconductor.RoleName
|
||||
import akka.remote.testkit.{STMultiNodeSpec, MultiNodeSpec}
|
||||
import akka.remote.testkit.{ STMultiNodeSpec, MultiNodeSpec }
|
||||
import akka.testkit._
|
||||
import scala.concurrent.util.duration._
|
||||
import scala.concurrent.util.Duration
|
||||
|
|
@ -31,15 +31,15 @@ object MultiNodeClusterSpec {
|
|||
def clusterConfig: Config = ConfigFactory.parseString("""
|
||||
akka.actor.provider = akka.cluster.ClusterActorRefProvider
|
||||
akka.cluster {
|
||||
auto-join = on
|
||||
auto-down = off
|
||||
jmx.enabled = off
|
||||
gossip-interval = 200 ms
|
||||
heartbeat-interval = 400 ms
|
||||
leader-actions-interval = 200 ms
|
||||
unreachable-nodes-reaper-interval = 200 ms
|
||||
periodic-tasks-initial-delay = 300 ms
|
||||
publish-stats-interval = 0 s # always, when it happens
|
||||
auto-join = on
|
||||
auto-down = off
|
||||
jmx.enabled = off
|
||||
gossip-interval = 200 ms
|
||||
leader-actions-interval = 200 ms
|
||||
unreachable-nodes-reaper-interval = 200 ms
|
||||
periodic-tasks-initial-delay = 300 ms
|
||||
publish-stats-interval = 0 s # always, when it happens
|
||||
failure-detector.heartbeat-interval = 400 ms
|
||||
}
|
||||
akka.remote.log-remote-lifecycle-events = off
|
||||
akka.test {
|
||||
|
|
|
|||
|
|
@ -214,6 +214,57 @@ frontend nodes and 3 backend nodes::
|
|||
|
||||
.. note:: The above example should probably be designed as two separate, frontend/backend, clusters, when there is a `cluster client for decoupling clusters <https://www.assembla.com/spaces/akka/tickets/1165>`_.
|
||||
|
||||
Failure Detector
|
||||
^^^^^^^^^^^^^^^^
|
||||
|
||||
The nodes in the cluster monitor each other by sending heartbeats to detect if a node is
|
||||
unreachable from the rest of the cluster. The heartbeat arrival times is interpreted
|
||||
by an implementation of
|
||||
`The Phi Accrual Failure Detector <http://ddg.jaist.ac.jp/pub/HDY+04.pdf>`_.
|
||||
|
||||
The suspicion level of failure is given by a value called *phi*.
|
||||
The basic idea of the phi failure detector is to express the value of *phi* on a scale that
|
||||
is dynamically adjusted to reflect current network conditions.
|
||||
|
||||
The value of *phi* is calculated as::
|
||||
|
||||
phi = -log10(1 - F(timeSinceLastHeartbeat)
|
||||
|
||||
where F is the cumulative distribution function of a normal distribution with mean
|
||||
and standard deviation estimated from historical heartbeat inter-arrival times.
|
||||
|
||||
In the :ref:`cluster_configuration` you can adjust the ``akka.cluster.failure-detector.threshold``
|
||||
to define when a *phi* value is considered to be a failure.
|
||||
|
||||
A low ``threshold`` is prone to generate many false positives but ensures
|
||||
a quick detection in the event of a real crash. Conversely, a high ``threshold``
|
||||
generates fewer mistakes but needs more time to detect actual crashes. The
|
||||
default ``threshold`` is 8 and is appropriate for most situations. However in
|
||||
cloud environments, such as Amazon EC2, the value could be increased to 12 in
|
||||
order to account for network issues that sometimes occur on such platforms.
|
||||
|
||||
The following chart illustrates how *phi* increase with increasing time since the
|
||||
previous heartbeat.
|
||||
|
||||
.. image:: images/phi1.png
|
||||
|
||||
Phi is calculated from the mean and standard deviation of historical
|
||||
inter arrival times. The previous chart is an example for standard deviation
|
||||
of 200 ms. If the heartbeats arrive with less deviation the curve becomes steeper,
|
||||
i.e. it's possible to determine failure more quickly. The curve looks like this for
|
||||
a standard deviation of 100 ms.
|
||||
|
||||
.. image:: images/phi2.png
|
||||
|
||||
To be able to survive sudden abnormalities, such as garbage collection pauses and
|
||||
transient network failures the failure detector is configured with a margin,
|
||||
``akka.cluster.failure-detector.acceptable-heartbeat-pause``. You may want to
|
||||
adjust the :ref:`cluster_configuration` of this depending on you environment.
|
||||
This is how the curve looks like for ``acceptable-heartbeat-pause`` configured to
|
||||
3 seconds.
|
||||
|
||||
.. image:: images/phi3.png
|
||||
|
||||
Cluster Aware Routers
|
||||
^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
|
|
|||
BIN
akka-docs/rst/cluster/images/phi1.png
Normal file
BIN
akka-docs/rst/cluster/images/phi1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
BIN
akka-docs/rst/cluster/images/phi2.png
Normal file
BIN
akka-docs/rst/cluster/images/phi2.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
akka-docs/rst/cluster/images/phi3.png
Normal file
BIN
akka-docs/rst/cluster/images/phi3.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 26 KiB |
Loading…
Add table
Add a link
Reference in a new issue